1 Data and Functions

library(SingleCellExperiment)
library(scater)
library(scran)
library(ggplot2)
library(gridExtra)
library(ggthemes)
library(pheatmap)
library(reshape2)
library(ggpubr)
errClass <- c("correct", "correctly unassigned",  "intermediate", "incorrectly unassigned",
              "error assigned", "misclassified")

ClassifyError <- function(cellTypes_pred, cellTypes_test, cellTypes_train){
  if(length(cellTypes_pred)!=length(cellTypes_test)){
    stop("wrong input")
  }
  train_ref <- unique(cellTypes_train)
  test_ref <- unique(cellTypes_test)
  res <- sapply(1:length(cellTypes_pred), function(i){
    if(cellTypes_test[i]%in%train_ref){
      if(cellTypes_pred[i] %in% c("unassigned", "Unassigned")){
        "incorrectly unassigned"
      }else if(cellTypes_pred[i] == "intermediate"){
        "intermediate"
      }else{
        if(cellTypes_test[i] == cellTypes_pred[i]){
          "correct"
        }else if(grepl(cellTypes_test[i], cellTypes_pred[i])|grepl("Node", cellTypes_pred[i])){
          "intermediate"
        }
        else{
          "misclassified"
        }
      }
    }else{
      if(cellTypes_pred[i] %in% c("unassigned","Unassigned")){
        "correctly unassigned"
      }else{
        "error assigned"
      }
    }
  })
  return(res)
}

freqError <- function(x){
  x <- factor(x, levels = errClass)
  table(x)/length(x)
}
load("data/pancreasSeven.RData")


baron <- baron[,grep("human", colnames(baron))]

baron
## class: SingleCellExperiment 
## dim: 35948 8569 
## metadata(0):
## assays(2): counts logcounts
## rownames(35948): SGIP1 AZIN2 ... MIR4776-2 IGHV4-80
## rowData names(0):
## colnames(8569): human1_lib1.final_cell_0001 human1_lib1.final_cell_0002
##   ... human4_lib3.final_cell_0700 human4_lib3.final_cell_0701
## colData names(2): cellTypes batch
## reducedDimNames(0):
## spikeNames(0):
## altExpNames(0):
muraro1
## class: SingleCellExperiment 
## dim: 35948 1008 
## metadata(0):
## assays(2): counts logcounts
## rownames(35948): SGIP1 AZIN2 ... MIR4776-2 IGHV4-80
## rowData names(0):
## colnames(1008): D2ex_1 D2ex_2 ... D17TGFB_94 D17TGFB_95
## colData names(2): cellTypes batch
## reducedDimNames(0):
## spikeNames(0):
## altExpNames(0):
muraro2
## class: SingleCellExperiment 
## dim: 35948 2122 
## metadata(0):
## assays(2): counts logcounts
## rownames(35948): SGIP1 AZIN2 ... MIR4776-2 IGHV4-80
## rowData names(0):
## colnames(2122): D28.1_1 D28.1_2 ... D30.8_93 D30.8_94
## colData names(2): cellTypes batch
## reducedDimNames(0):
## spikeNames(0):
## altExpNames(0):
segerstolpe
## class: SingleCellExperiment 
## dim: 35948 2207 
## metadata(0):
## assays(2): counts logcounts
## rownames(35948): SGIP1 AZIN2 ... MIR4776-2 IGHV4-80
## rowData names(0):
## colnames(2207): HP1502401_H13 HP1502401_J14 ... HP1526901T2D_N8
##   HP1526901T2D_A8
## colData names(2): cellTypes batch
## reducedDimNames(0):
## spikeNames(0):
## altExpNames(0):
lawlor
## class: SingleCellExperiment 
## dim: 35948 617 
## metadata(0):
## assays(2): counts logcounts
## rownames(35948): SGIP1 AZIN2 ... MIR4776-2 IGHV4-80
## rowData names(0):
## colnames(617): 10th_C11_S96 10th_C13_S61 ... 9th_C94_S83 9th_C9_S13
## colData names(2): cellTypes batch
## reducedDimNames(0):
## spikeNames(0):
## altExpNames(0):
xin
## class: SingleCellExperiment 
## dim: 35948 1474 
## metadata(0):
## assays(2): counts logcounts
## rownames(35948): SGIP1 AZIN2 ... MIR4776-2 IGHV4-80
## rowData names(0):
## colnames(1474): SRR3541305 SRR3541306 ... SRR3542902 SRR3542903
## colData names(2): cellTypes batch
## reducedDimNames(0):
## spikeNames(0):
## altExpNames(0):
wang
## class: SingleCellExperiment 
## dim: 35948 501 
## metadata(0):
## assays(2): counts logcounts
## rownames(35948): SGIP1 AZIN2 ... MIR4776-2 IGHV4-80
## rowData names(0):
## colnames(501): SRR3649793 SRR3649794 ... SRR3650977 SRR3650979
## colData names(2): cellTypes batch
## reducedDimNames(0):
## spikeNames(0):
## altExpNames(0):
xin$cellTypes[xin$cellTypes=="PP"] <- "gamma"

table(baron$cellTypes)
## 
##      acinar       alpha        beta       delta      ductal endothelial 
##         958        2326        2525         601        1077         252 
##     epsilon       gamma      immune  macrophage        mast     schwann 
##          18         255           7          55          25          13 
##    stellate 
##         457
table(muraro1$cellTypes)
## 
##      acinar       alpha        beta       delta      ductal       gamma 
##         249         265          22         123         298          30 
## mesenchymal 
##          21
table(muraro2$cellTypes)
## 
##      acinar       alpha        beta       delta      ductal endothelial 
##         219         812         448         193         245          21 
##     epsilon       gamma mesenchymal 
##           3         101          80
table(segerstolpe$cellTypes)
## 
##                 acinar                  alpha                   beta 
##                    185                    886                    270 
##          co-expression                  delta                 ductal 
##                     39                    114                    386 
##            endothelial                epsilon                  gamma 
##                     16                      7                    197 
##                   mast           MHC class II               stellate 
##                      7                      5                     54 
## unclassified endocrine 
##                     41
table(lawlor$cellTypes)
## 
##   acinar    alpha     beta    delta   ductal    gamma stellate 
##       24      239      264       25       28       18       19
table(xin$cellTypes)
## 
## alpha  beta delta gamma 
##   885   461    49    79
table(wang$cellTypes)
## 
##      acinar       alpha        beta       delta      ductal       gamma 
##           5         206         118          10          96          21 
## mesenchymal 
##          45
muraro2$cellTypes[muraro2$cellTypes == "mesenchymal"] <- "stellate"

wang$cellTypes[wang$cellTypes == "mesenchymal"] <- "stellate"

segerstolpe <- segerstolpe[,!segerstolpe$cellTypes%in%c("co-expression","unclassified endocrine")]

pancreas_cellTypes <- list(
  baron = baron$cellTypes,
  muraro2 = muraro2$cellTypes,
  segerstolpe = segerstolpe$cellTypes,
  wang = wang$cellTypes,
  xin = xin$cellTypes,
  lawlor = lawlor$cellTypes
)
muraro_scClassify_ensemble_res <- readRDS("ensembleRes/muraro_scClassify_ensemble_res.rds")
lawlor_scClassify_ensemble_res <- readRDS("ensembleRes/lawlor_scClassify_ensemble_res.rds")
segerstolpe_scClassify_ensemble_res <- readRDS("ensembleRes/segerstolpe_scClassify_ensemble_res.rds")
wang_scClassify_ensemble_res <- readRDS("ensembleRes/wang_scClassify_ensemble_res.rds")
xin_scClassify_ensemble_res <- readRDS("ensembleRes/xin_scClassify_ensemble_res.rds")
baron_scClassify_ensemble_res <- readRDS("ensembleRes/baron_scClassify_ensemble_res.rds")

Helper functions

(It is incorporated to scClassify package)

getEnsembleError <- function(res){
  ensembleErr <- do.call(rbind, lapply(res, function(x) table(x$classifyRes)/length(x$classifyRes)))
  return(ensembleErr)
}


getEnsembleRes <- function(res, exclude = NULL, weight = NULL){
  
  if(!is.null(exclude)){
    keep_method <- !grepl(paste(exclude,collapse = "|"), names(res))
    res <- res[keep_method]
    weight <- weight[keep_method]
  }
  
  if(is.null(weight)){
    weight <- rep(1, length(res))
  }
  
  ensembleResMat <- do.call(cbind, lapply(res, function(x) x$predRes))
  
  ensembleRes <- apply(ensembleResMat, 1, function(x) {
    names(x) <- colnames(ensembleResMat)
    # keep <- x!="unassigned"
    keep <- rep(TRUE, length(x))
    if(sum(keep) == 0){
      data.frame(cellTypes = "unassigned", scores = 0)
    }else{
      # tab <- table(x)/length(x)
      # ensemble_cellTypes <- names(tab)[which.max(tab)]
      # ensemble_scores <- max(tab)
      getResByWeights(x[keep], weight[keep])
    }
    # data.frame(cellTypes = ensemble_cellTypes, scores = ensemble_scores)
    
    
  })
  
  ensembleRes <- do.call(rbind, ensembleRes)
  return(ensembleRes)
}


getResByWeights <- function(res, weight) {
  resType <- unique(res)
  if(length(resType) == 1){
    final <- resType
    scores <- 1
  }else{
    
    
    mat <- sapply(1:length(resType), function(i){
      ifelse(res %in% resType[i], 1, 0) * weight
    })
    colnames(mat) <- resType
    rownames(mat) <- names(res)
    mat_colMeans <- colMeans(mat)
    final <- names(mat_colMeans)[which.max(mat_colMeans)]
    scores <- max(mat_colMeans)/sum(mat_colMeans)
  }
  return(data.frame(cellTypes = final, scores = scores))
}

2 Calculate Weights

lawlor_ensemble_error <- lapply(lawlor_scClassify_ensemble_res$testRes, getEnsembleError)
xin_ensemble_res <- lapply(xin_scClassify_ensemble_res$testRes, getEnsembleRes)
xin_ensemble_error <- lapply(xin_scClassify_ensemble_res$testRes, getEnsembleError)
wang_ensemble_res <- lapply(wang_scClassify_ensemble_res$testRes, getEnsembleRes)
wang_ensemble_error <- lapply(wang_scClassify_ensemble_res$testRes, getEnsembleError)
segerstolpe_ensemble_res <- lapply(segerstolpe_scClassify_ensemble_res$testRes, getEnsembleRes)
segerstolpe_ensemble_error <- lapply(segerstolpe_scClassify_ensemble_res$testRes, getEnsembleError)
muraro_ensemble_res <- lapply(muraro_scClassify_ensemble_res$testRes, getEnsembleRes)
muraro_ensemble_error <- lapply(muraro_scClassify_ensemble_res$testRes, getEnsembleError)
baron_ensemble_res <- lapply(baron_scClassify_ensemble_res$testRes, getEnsembleRes)
baron_ensemble_error <- lapply(baron_scClassify_ensemble_res$testRes, getEnsembleError)
misclassifiedMat <- cbind(do.call(cbind, lapply(lawlor_ensemble_error, function(x) x[,1] + x[,2])),
                          do.call(cbind, lapply(xin_ensemble_error, function(x) x[,1] + x[,2])),
                          do.call(cbind, lapply(wang_ensemble_error, function(x) x[,1] + x[,2])),
                          do.call(cbind, lapply(segerstolpe_ensemble_error, function(x) x[,1] + x[,2])),
                          do.call(cbind, lapply(muraro_ensemble_error, function(x) x[,1] + x[,2])),
                          do.call(cbind, lapply(baron_ensemble_error, function(x) x[,1] + x[,2])))
colnames(misclassifiedMat) <- c(paste("lawlor_", names(lawlor_ensemble_error), sep = ""), 
                                paste("xin_", names(xin_ensemble_error), sep = ""), 
                                paste("wang_", names(wang_ensemble_error), sep = ""), 
                                paste("segerstolpe_", names(segerstolpe_ensemble_error), sep = ""), 
                                paste("muraro_", names(muraro_ensemble_error), sep = ""),
                                paste("baron_", names(baron_ensemble_error), sep = ""))

rownames(misclassifiedMat)[grep("weighted_rank", rownames(misclassifiedMat))] <- gsub("weighted_rank", "weighted.rank", rownames(misclassifiedMat)[grep("weighted_rank", rownames(misclassifiedMat))])
anno_col <- data.frame(feature = unlist(lapply(strsplit(rownames(misclassifiedMat), "_"), "[[", 3)),
                       distance = unlist(lapply(strsplit(rownames(misclassifiedMat), "_"), "[[", 1)),
                       KNN = unlist(lapply(strsplit(rownames(misclassifiedMat), "_"), "[[", 2)))
rownames(anno_col) <- rownames(misclassifiedMat)

anno_colour <- list(feature = tableau_color_pal(palette = "Purple-Pink-Gray")(5)[c(1,2,4,5,3)],
                    distance = tableau_color_pal(palette = "Tableau 10")(6),
                    KNN = tableau_color_pal(palette = "Winter")(6)[c(3,4)])

names(anno_colour$feature) <- levels(anno_col$feature)
names(anno_colour$distance) <- levels(anno_col$distance)
names(anno_colour$KNN) <- levels(anno_col$KNN)


color_forHeatmap <- rev(viridis::viridis(200, begin = 0.1))



toplot <- misclassifiedMat[order(round(apply(misclassifiedMat, 1, mean), 3), anno_col$distance, anno_col$feature),]
toplot <- toplot[grepl("WKNN", rownames(toplot)), ]


breaks <- c(seq(0.2, 0.5, 0.1), seq(0.51, 1, 0.0001))
color_forHeatmap <- (viridis::plasma(length(breaks), begin = 0))

pheatmap(t(toplot),
         cluster_rows = F,
         cluster_cols = F,
         color = color_forHeatmap,
         annotation_col = anno_col,
         annotation_colors = anno_colour,
         breaks = breaks)

pheatmap(t(toplot),
         cluster_rows = F,
         cluster_cols = F,
         color = color_forHeatmap,
         annotation_col = anno_col,
         annotation_colors = anno_colour,
         breaks = breaks,
         filename = "figures/Figure1C_heatmap_misclassified_pancreasSix_ensemble_revision_plasma.pdf",
         width = 12,
         height = 10)

write.csv(toplot, file = "ensembleRes/pancreas_ensemble_accuracy_rate_mat.csv", row.names = TRUE)
lawlor_scClassify_ensemble_res_itself <- readRDS("ensembleRes/lawlor_scClassify_ensemble_res_itself.rds")


lawlor_ensemble_acc_train <- lapply(lawlor_scClassify_ensemble_res_itself$testRes, getEnsembleError)[[1]][,1]


alpha <- function(e) {
  log((1 - e)/e)
}



lawlor_ensemble_res_weights <- lapply(lawlor_scClassify_ensemble_res$testRes, function(x) 
  getEnsembleRes(x, exclude = c("_KNN"),
                 weight = alpha(1 - lawlor_ensemble_acc_train)))

lawlor_ensemble_res_final_weights <- rbind(xin = freqError(ClassifyError(as.character(lawlor_ensemble_res_weights$xin$cellTypes), xin$cellTypes, lawlor$cellTypes)),
                                           wang = freqError(ClassifyError(as.character(lawlor_ensemble_res_weights$wang$cellTypes), wang$cellTypes, lawlor$cellTypes)),
                                           segerstolpe = freqError(ClassifyError(as.character(lawlor_ensemble_res_weights$segerstolpe$cellTypes), segerstolpe$cellTypes, lawlor$cellTypes)),
                                           baron = freqError(ClassifyError(as.character(lawlor_ensemble_res_weights$baron$cellTypes), baron$cellTypes, lawlor$cellTypes)),
                                           muraro = freqError(ClassifyError(as.character(lawlor_ensemble_res_weights$muraro$cellTypes), muraro2$cellTypes, lawlor$cellTypes)))



lawlor_scClassify_res_pancreas_pearson <- rbind(xin = freqError(ClassifyError(as.character(lawlor_scClassify_ensemble_res$testRes$xin$pearson_WKNN_limma$predRes), xin$cellTypes, lawlor$cellTypes)),
                                                wang = freqError(ClassifyError(as.character(lawlor_scClassify_ensemble_res$testRes$wang$pearson_WKNN_limma$predRes), wang$cellTypes, lawlor$cellTypes)),
                                                segerstolpe = freqError(ClassifyError(as.character(lawlor_scClassify_ensemble_res$testRes$segerstolpe$pearson_WKNN_limma$predRes), segerstolpe$cellTypes, lawlor$cellTypes)),
                                                baron = freqError(ClassifyError(as.character(lawlor_scClassify_ensemble_res$testRes$baron$pearson_WKNN_limma$predRes), baron$cellTypes, lawlor$cellTypes)),
                                                muraro = freqError(ClassifyError(as.character(lawlor_scClassify_ensemble_res$testRes$muraro$pearson_WKNN_limma$predRes), muraro2$cellTypes, lawlor$cellTypes)))
wang_scClassify_ensemble_res_itself <- readRDS("ensembleRes/wang_scClassify_ensemble_res_itself.rds")


wang_ensemble_acc_train <- lapply(wang_scClassify_ensemble_res_itself$testRes, getEnsembleError)[[1]][,1]


alpha <- function(e) {
  log((1 - e)/e)
}



wang_ensemble_res_weights <- lapply(wang_scClassify_ensemble_res$testRes, function(x) 
  getEnsembleRes(x, exclude = c("_KNN"),
                 weight = alpha(1 - wang_ensemble_acc_train)))

wang_ensemble_res_final_weights <- rbind(xin = freqError(ClassifyError(as.character(wang_ensemble_res_weights$xin$cellTypes), xin$cellTypes, wang$cellTypes)),
                                         lawlor = freqError(ClassifyError(as.character(wang_ensemble_res_weights$lawlor$cellTypes), lawlor$cellTypes, wang$cellTypes)),
                                         segerstolpe = freqError(ClassifyError(as.character(wang_ensemble_res_weights$segerstolpe$cellTypes), segerstolpe$cellTypes, wang$cellTypes)),
                                         baron = freqError(ClassifyError(as.character(wang_ensemble_res_weights$baron$cellTypes), baron$cellTypes, wang$cellTypes)),
                                         muraro = freqError(ClassifyError(as.character(wang_ensemble_res_weights$muraro$cellTypes), muraro2$cellTypes, wang$cellTypes)))





wang_scClassify_res_pancreas_pearson <- rbind(xin = freqError(ClassifyError(as.character(wang_scClassify_ensemble_res$testRes$xin$pearson_WKNN_limma$predRes), xin$cellTypes, wang$cellTypes)),
                                              lawlor = freqError(ClassifyError(as.character(wang_scClassify_ensemble_res$testRes$lawlor$pearson_WKNN_limma$predRes), lawlor$cellTypes, wang$cellTypes)),
                                              segerstolpe = freqError(ClassifyError(as.character(wang_scClassify_ensemble_res$testRes$segerstolpe$pearson_WKNN_limma$predRes), segerstolpe$cellTypes, wang$cellTypes)),
                                              baron = freqError(ClassifyError(as.character(wang_scClassify_ensemble_res$testRes$baron$pearson_WKNN_limma$predRes), baron$cellTypes, wang$cellTypes)),
                                              muraro = freqError(ClassifyError(as.character(wang_scClassify_ensemble_res$testRes$muraro$pearson_WKNN_limma$predRes), muraro2$cellTypes, wang$cellTypes)))
baron_scClassify_ensemble_res_itself <- readRDS("ensembleRes/baron_scClassify_ensemble_res_itself.rds")


baron_ensemble_acc_train <- lapply(baron_scClassify_ensemble_res_itself$testRes, getEnsembleError)[[1]][,1]


alpha <- function(e) {
  log((1 - e)/e)
}



baron_ensemble_res_weights <- lapply(baron_scClassify_ensemble_res$testRes, function(x) 
  getEnsembleRes(x, exclude = c("_KNN"),
                 weight = alpha(1 - baron_ensemble_acc_train)))

baron_ensemble_res_final_weights <- rbind(xin = freqError(ClassifyError(as.character(baron_ensemble_res_weights$xin$cellTypes), xin$cellTypes, baron$cellTypes)),
                                          lawlor = freqError(ClassifyError(as.character(baron_ensemble_res_weights$lawlor$cellTypes), lawlor$cellTypes, baron$cellTypes)),
                                          segerstolpe = freqError(ClassifyError(as.character(baron_ensemble_res_weights$segerstolpe$cellTypes), segerstolpe$cellTypes, baron$cellTypes)),
                                          wang = freqError(ClassifyError(as.character(baron_ensemble_res_weights$wang$cellTypes), wang$cellTypes, baron$cellTypes)),
                                          muraro = freqError(ClassifyError(as.character(baron_ensemble_res_weights$muraro$cellTypes), muraro2$cellTypes, baron$cellTypes)))





baron_scClassify_res_pancreas_pearson <- rbind(xin = freqError(ClassifyError(as.character(baron_scClassify_ensemble_res$testRes$xin$pearson_WKNN_limma$predRes), xin$cellTypes, baron$cellTypes)),
                                               lawlor = freqError(ClassifyError(as.character(baron_scClassify_ensemble_res$testRes$lawlor$pearson_WKNN_limma$predRes), lawlor$cellTypes, baron$cellTypes)),
                                               segerstolpe = freqError(ClassifyError(as.character(baron_scClassify_ensemble_res$testRes$segerstolpe$pearson_WKNN_limma$predRes), segerstolpe$cellTypes, baron$cellTypes)),
                                               wang = freqError(ClassifyError(as.character(baron_scClassify_ensemble_res$testRes$wang$pearson_WKNN_limma$predRes), wang$cellTypes, baron$cellTypes)),
                                               muraro = freqError(ClassifyError(as.character(baron_scClassify_ensemble_res$testRes$muraro$pearson_WKNN_limma$predRes), muraro2$cellTypes, baron$cellTypes)))
segerstolpe_scClassify_ensemble_res_itself <- readRDS("ensembleRes/segerstolpe_scClassify_ensemble_res_itself.rds")


segerstolpe_ensemble_acc_train <- lapply(segerstolpe_scClassify_ensemble_res_itself$testRes, getEnsembleError)[[1]][,1]


alpha <- function(e) {
  log((1 - e)/e)
}



segerstolpe_ensemble_res_weights <- lapply(segerstolpe_scClassify_ensemble_res$testRes, function(x) 
  getEnsembleRes(x, exclude = c("_KNN"),
                 weight = alpha(1 - segerstolpe_ensemble_acc_train)))




segerstolpe_ensemble_res_final_weights <- rbind(xin = freqError(ClassifyError(as.character(segerstolpe_ensemble_res_weights$xin$cellTypes), xin$cellTypes, segerstolpe$cellTypes)),
                                                lawlor = freqError(ClassifyError(as.character(segerstolpe_ensemble_res_weights$lawlor$cellTypes), lawlor$cellTypes, segerstolpe$cellTypes)),
                                                wang = freqError(ClassifyError(as.character(segerstolpe_ensemble_res_weights$wang$cellTypes), wang$cellTypes, segerstolpe$cellTypes)),
                                                baron = freqError(ClassifyError(as.character(segerstolpe_ensemble_res_weights$baron$cellTypes), baron$cellTypes, segerstolpe$cellTypes)),
                                                muraro = freqError(ClassifyError(as.character(segerstolpe_ensemble_res_weights$muraro$cellTypes), muraro2$cellTypes, segerstolpe$cellTypes)))




segerstolpe_scClassify_res_pancreas_pearson <- rbind(xin = freqError(ClassifyError(as.character(segerstolpe_scClassify_ensemble_res$testRes$xin$pearson_WKNN_limma$predRes), xin$cellTypes, segerstolpe$cellTypes)),
                                                     lawlor = freqError(ClassifyError(as.character(segerstolpe_scClassify_ensemble_res$testRes$lawlor$pearson_WKNN_limma$predRes), lawlor$cellTypes, segerstolpe$cellTypes)),
                                                     wang = freqError(ClassifyError(as.character(segerstolpe_scClassify_ensemble_res$testRes$wang$pearson_WKNN_limma$predRes), wang$cellTypes, segerstolpe$cellTypes)),
                                                     baron = freqError(ClassifyError(as.character(segerstolpe_scClassify_ensemble_res$testRes$baron$pearson_WKNN_limma$predRes), baron$cellTypes, segerstolpe$cellTypes)),
                                                     muraro = freqError(ClassifyError(as.character(segerstolpe_scClassify_ensemble_res$testRes$muraro$pearson_WKNN_limma$predRes), muraro2$cellTypes, segerstolpe$cellTypes)))
xin_scClassify_ensemble_res_itself <- readRDS("ensembleRes/xin_scClassify_ensemble_res_itself.rds")


xin_ensemble_acc_train <- lapply(xin_scClassify_ensemble_res_itself$testRes, getEnsembleError)[[1]][,1]


alpha <- function(e) {
  log((1 - e)/e)
}

xin_ensemble_res_weights <- lapply(xin_scClassify_ensemble_res$testRes, function(x) 
  getEnsembleRes(x, exclude = c("_KNN"),
                 weight = alpha(1 - xin_ensemble_acc_train)))

xin_ensemble_res_final_weights <- rbind(muraro = freqError(ClassifyError(as.character(xin_ensemble_res_weights$muraro$cellTypes), muraro2$cellTypes, xin$cellTypes)),
                                        lawlor = freqError(ClassifyError(as.character(xin_ensemble_res_weights$lawlor$cellTypes), lawlor$cellTypes, xin$cellTypes)),
                                        wang = freqError(ClassifyError(as.character(xin_ensemble_res_weights$wang$cellTypes), wang$cellTypes, xin$cellTypes)),
                                        baron = freqError(ClassifyError(as.character(xin_ensemble_res_weights$baron$cellTypes), baron$cellTypes, xin$cellTypes)),
                                        segerstolpe = freqError(ClassifyError(as.character(xin_ensemble_res_weights$segerstolpe$cellTypes), segerstolpe$cellTypes, xin$cellTypes)))


xin_scClassify_res_pancreas_pearson <- rbind(muraro = freqError(ClassifyError(as.character(xin_scClassify_ensemble_res$testRes$muraro$pearson_WKNN_limma$predRes), muraro2$cellTypes, xin$cellTypes)),
                                             lawlor = freqError(ClassifyError(as.character(xin_scClassify_ensemble_res$testRes$lawlor$pearson_WKNN_limma$predRes), lawlor$cellTypes, xin$cellTypes)),
                                             wang = freqError(ClassifyError(as.character(xin_scClassify_ensemble_res$testRes$wang$pearson_WKNN_limma$predRes), wang$cellTypes, xin$cellTypes)),
                                             baron = freqError(ClassifyError(as.character(xin_scClassify_ensemble_res$testRes$baron$pearson_WKNN_limma$predRes), baron$cellTypes, xin$cellTypes)),
                                             segerstolpe = freqError(ClassifyError(as.character(xin_scClassify_ensemble_res$testRes$segerstolpe$pearson_WKNN_limma$predRes), segerstolpe$cellTypes, xin$cellTypes)))
muraro_scClassify_ensemble_res_itself <- readRDS("ensembleRes/muraro_scClassify_ensemble_res_itself.rds")


muraro_ensemble_acc_train <- lapply(muraro_scClassify_ensemble_res_itself$testRes, getEnsembleError)[[1]][,1]


alpha <- function(e) {
  log((1 - e)/e)
}

muraro_ensemble_res_weights <- lapply(muraro_scClassify_ensemble_res$testRes, function(x) 
  getEnsembleRes(x, exclude = c("_KNN"),
                 weight = alpha(1 - muraro_ensemble_acc_train)))

muraro_ensemble_res_final_weights <- rbind(xin = freqError(ClassifyError(as.character(muraro_ensemble_res_weights$xin$cellTypes), xin$cellTypes, muraro2$cellTypes)),
                                           lawlor = freqError(ClassifyError(as.character(muraro_ensemble_res_weights$lawlor$cellTypes), lawlor$cellTypes, muraro2$cellTypes)),
                                           wang = freqError(ClassifyError(as.character(muraro_ensemble_res_weights$wang$cellTypes), wang$cellTypes, muraro2$cellTypes)),
                                           baron = freqError(ClassifyError(as.character(muraro_ensemble_res_weights$baron$cellTypes), baron$cellTypes, muraro2$cellTypes)),
                                           segerstolpe = freqError(ClassifyError(as.character(muraro_ensemble_res_weights$segerstolpe$cellTypes), segerstolpe$cellTypes, muraro2$cellTypes)))



muraro_scClassify_res_pancreas_pearson <- rbind(xin = freqError(ClassifyError(as.character(muraro_scClassify_ensemble_res$testRes$xin$pearson_WKNN_limma$predRes), xin$cellTypes, muraro2$cellTypes)),
                                                lawlor = freqError(ClassifyError(as.character(muraro_scClassify_ensemble_res$testRes$lawlor$pearson_WKNN_limma$predRes), lawlor$cellTypes, muraro2$cellTypes)),
                                                wang = freqError(ClassifyError(as.character(muraro_scClassify_ensemble_res$testRes$wang$pearson_WKNN_limma$predRes), wang$cellTypes, muraro2$cellTypes)),
                                                baron = freqError(ClassifyError(as.character(muraro_scClassify_ensemble_res$testRes$baron$pearson_WKNN_limma$predRes), baron$cellTypes, muraro2$cellTypes)),
                                                segerstolpe = freqError(ClassifyError(as.character(muraro_scClassify_ensemble_res$testRes$segerstolpe$pearson_WKNN_limma$predRes), segerstolpe$cellTypes, muraro2$cellTypes)))

3 Ensemble results

freqError <- function(x){
  x <- factor(x, levels = errClass)
  table(x)/length(x)
}
lawlor_pearson_res <- do.call(rbind, lapply(lawlor_ensemble_error, function(x) x[1,]))
wang_pearson_res <- do.call(rbind, lapply(wang_ensemble_error, function(x) x[1,]))
xin_pearson_res <- do.call(rbind, lapply(xin_ensemble_error, function(x) x[1,]))
segerstolpe_pearson_res <- do.call(rbind, lapply(segerstolpe_ensemble_error, function(x) x[1,]))
baron_pearson_res <- do.call(rbind, lapply(baron_ensemble_error, function(x) x[1,]))
muraro_pearson_res <- do.call(rbind, lapply(muraro_ensemble_error, function(x) x[1,]))


pearson_res <- rbind(baron_pearson_res,
                     muraro_pearson_res,
                     lawlor_pearson_res,
                     segerstolpe_pearson_res,
                     xin_pearson_res,
                     wang_pearson_res)

rownames(pearson_res) <- c(paste("baron_", rownames(baron_pearson_res), sep = ""),
                           paste("muraro_", rownames(muraro_pearson_res), sep = ""),
                           paste("lawlor_", rownames(lawlor_pearson_res), sep = ""),
                           paste("segerstolpe_", rownames(segerstolpe_pearson_res), sep = ""),
                           paste("xin_", rownames(xin_pearson_res), sep = ""),
                           paste("wang_", rownames(wang_pearson_res), sep = ""))
ensemble_res_final_weights <- rbind(baron_ensemble_res_final_weights,
                                    muraro_ensemble_res_final_weights,
                                    lawlor_ensemble_res_final_weights,
                                    segerstolpe_ensemble_res_final_weights,
                                    xin_ensemble_res_final_weights,
                                    wang_ensemble_res_final_weights)

rownames(ensemble_res_final_weights) <- c(paste("baron_", rownames(baron_ensemble_res_final_weights), sep = ""),
                                          paste("muraro_", rownames(muraro_ensemble_res_final_weights), sep = ""),
                                          paste("lawlor_", rownames(lawlor_ensemble_res_final_weights), sep = ""),
                                          paste("segerstolpe_", rownames(segerstolpe_ensemble_res_final_weights), sep = ""),
                                          paste("xin_", rownames(xin_ensemble_res_final_weights), sep = ""),
                                          paste("wang_", rownames(wang_ensemble_res_final_weights), sep = ""))


saveRDS(ensemble_res_final_weights, "results/pancreas_scClassifyRes_EnsembleWeighted.rds")





scClassify_res_pancreas_pearson <- rbind(baron_scClassify_res_pancreas_pearson,
                                         muraro_scClassify_res_pancreas_pearson,
                                         lawlor_scClassify_res_pancreas_pearson,
                                         segerstolpe_scClassify_res_pancreas_pearson,
                                         xin_scClassify_res_pancreas_pearson,
                                         wang_scClassify_res_pancreas_pearson)

rownames(scClassify_res_pancreas_pearson) <- c(paste("baron_", rownames(baron_scClassify_res_pancreas_pearson), sep = ""),
                                               paste("muraro_", rownames(muraro_scClassify_res_pancreas_pearson), sep = ""),
                                               paste("lawlor_", rownames(lawlor_scClassify_res_pancreas_pearson), sep = ""),
                                               paste("segerstolpe_", rownames(segerstolpe_scClassify_res_pancreas_pearson), sep = ""),
                                               paste("xin_", rownames(xin_scClassify_res_pancreas_pearson), sep = ""),
                                               paste("wang_", rownames(wang_scClassify_res_pancreas_pearson), sep = ""))


saveRDS(scClassify_res_pancreas_pearson, "results/pancreas_scClassifyRes_pearsonDE.rds")

4 Benchmarking

4.1 ACTINN

files <- list.files("results/benchmarking_results/ACTINN/result", pattern = ".csv")
cases <- unique(unlist(lapply(strsplit(files, "_"), function(x) paste(x[1], x[2], sep = "_"))))

ACTINN_res <- list()
for (i in 1:length(cases)){
  # print(cases[i])
  res <- read.csv(paste('results/benchmarking_results/ACTINN/result/', cases[i], '_ACTINN_Pred.csv', sep = ""))
  true <- read.csv(paste('results/benchmarking_results/ACTINN/result/', cases[i], '_ACTINN_True.csv', sep = ""))
  
  ACTINN_res[[i]] <- ClassifyError(as.character(res$X0), 
                                   cellTypes_test = pancreas_cellTypes[[strsplit(cases[i], "_")[[1]][2]]], 
                                   cellTypes_train = pancreas_cellTypes[[strsplit(cases[i], "_")[[1]][1]]])
  ACTINN_res[[i]] <- table(factor(ACTINN_res[[i]], levels = errClass))/length(ACTINN_res[[i]])
}

names(ACTINN_res) <- cases
ACTINN_res <- do.call(rbind, ACTINN_res)

4.2 Castle

files <- list.files("results/benchmarking_results/CaSTLe/result", pattern = ".csv")
cases <- unique(unlist(lapply(strsplit(files, "_"), function(x) paste(x[1], x[2], sep = "_"))))

Castle_res <- list()
for (i in 1:length(cases)){
  # print(cases[i])
  res <- read.csv(paste('results/benchmarking_results/CaSTLe/result/', cases[i], '_Pred_CaSTLe.csv', sep = ""))
  true <- read.csv(paste('results/benchmarking_results/CaSTLe/result/', cases[i], '_True_CaSTLe.csv', sep = ""))
  Castle_res[[i]] <- ClassifyError(as.character(res$x), 
                                   cellTypes_test = pancreas_cellTypes[[strsplit(cases[i], "_")[[1]][2]]], 
                                   cellTypes_train = pancreas_cellTypes[[strsplit(cases[i], "_")[[1]][1]]])
  Castle_res[[i]] <- table(factor(Castle_res[[i]], levels = errClass))/length(Castle_res[[i]])
}


names(Castle_res) <- cases
Castle_res <- do.call(rbind, Castle_res)

4.3 CHETAH

human_idx <- grep("human", colnames(baron))

files <- list.files("results/benchmarking_results/CHETAH/pancreas/", pattern = ".csv")
cases <- unique(unlist(lapply(strsplit(files, "_"), function(x) paste(x[1], x[2], sep = "_"))))

CHETAH_res <- list()
for (i in 1:length(cases)){
  # print(cases[i])
  res <- read.csv(paste('results/benchmarking_results/CHETAH/pancreas/', cases[i], '_CHETAH_Pred.csv', sep = ""))
  true <- read.csv(paste('results/benchmarking_results/CHETAH/pancreas/', cases[i], '_CHETAH_True.csv', sep = ""))
  if(strsplit(cases[i], "_")[[1]][2] == "baron") {
    CHETAH_res[[i]] <- ClassifyError(as.character(res$x)[human_idx], 
                                     cellTypes_test = pancreas_cellTypes[[strsplit(cases[i], "_")[[1]][2]]], 
                                     cellTypes_train = pancreas_cellTypes[[strsplit(cases[i], "_")[[1]][1]]])
  }else {
    CHETAH_res[[i]] <- ClassifyError(as.character(res$x), 
                                     cellTypes_test = pancreas_cellTypes[[strsplit(cases[i], "_")[[1]][2]]], 
                                     cellTypes_train = pancreas_cellTypes[[strsplit(cases[i], "_")[[1]][1]]])
  }
  
  CHETAH_res[[i]] <- table(factor(CHETAH_res[[i]], levels = errClass))/length(CHETAH_res[[i]])
}


names(CHETAH_res) <- cases

CHETAH_res <- do.call(rbind, CHETAH_res)

4.4 scID

files <- list.files("results/benchmarking_results/scID", pattern = ".csv")
cases <- unique(unlist(lapply(strsplit(files, "_"), function(x) paste(x[1], x[2], sep = "_"))))

scID_res <- list()
for (i in 1:length(cases)){
  # print(cases[i])
  res <- read.csv(paste('results/benchmarking_results/scID/', cases[i], '_scID_Pred.csv', sep = ""))
  true <- read.csv(paste('results/benchmarking_results/scID/', cases[i], '_scID_True.csv', sep = ""))
  scID_res[[i]] <- ClassifyError(as.character(res$x), 
                                 cellTypes_test = pancreas_cellTypes[[strsplit(cases[i], "_")[[1]][2]]], 
                                 cellTypes_train = pancreas_cellTypes[[strsplit(cases[i], "_")[[1]][1]]])
  scID_res[[i]] <- table(factor(scID_res[[i]], levels = errClass))/length(scID_res[[i]])
}


names(scID_res) <- cases
scID_res <- do.call(rbind, scID_res)

4.5 singleR

files <- list.files("results/benchmarking_results/singleR/result/", pattern = ".csv")
cases <- unique(unlist(lapply(strsplit(files, "_"), function(x) paste(x[1], x[2], sep = "_"))))

singleR_res <- list()
for (i in 1:length(cases)){
  # print(cases[i])
  res <- read.csv(paste('results/benchmarking_results/singleR/result/', cases[i], '_SingleR_Pred.csv', sep = ""))
  true <- read.csv(paste('results/benchmarking_results/singleR/result/', cases[i], '_SingleR_True.csv', sep = ""))
  singleR_res[[i]] <- ClassifyError(as.character(res$x), 
                                    cellTypes_test = pancreas_cellTypes[[gsub(" ", "", strsplit(cases[i], "_")[[1]][2])]], 
                                    cellTypes_train = pancreas_cellTypes[[gsub(" ", "", strsplit(cases[i], "_")[[1]][1])]])
  singleR_res[[i]] <- table(factor(singleR_res[[i]], levels = errClass))/length(singleR_res[[i]])
}


names(singleR_res) <- cases
singleR_res <- do.call(rbind, singleR_res)

4.6 scPred

files <- list.files("results/benchmarking_results/scPred/result/", pattern = ".csv")
cases <- unique(unlist(lapply(strsplit(files, "_"), function(x) paste(x[1], x[2], sep = "_"))))

scPred_res <- list()
for (i in 1:length(cases)){
  # print(cases[i])
  res <- read.csv(paste('results/benchmarking_results/scPred/result/', cases[i], '_scPred_Pred.csv', sep = ""))
  true <- read.csv(paste('results/benchmarking_results/scPred/result/', cases[i], '_scPred_True.csv', sep = ""))
  scPred_res[[i]] <- ClassifyError(as.character(res$x), 
                                   cellTypes_test = pancreas_cellTypes[[strsplit(cases[i], "_")[[1]][2]]], 
                                   cellTypes_train = pancreas_cellTypes[[strsplit(cases[i], "_")[[1]][1]]])
  scPred_res[[i]] <- table(factor(scPred_res[[i]], levels = errClass))/length(scPred_res[[i]])
}


names(scPred_res) <- cases
scPred_res <- do.call(rbind, scPred_res)

4.7 scmap-cell

files <- list.files("results/benchmarking_results/scmap/result", pattern = ".csv")
cases <- unique(unlist(lapply(strsplit(files, "_"), function(x) paste(x[1], x[2], sep = "_"))))

scmapCell_res <- list()
for (i in 1:length(cases)){
  # print(cases[i])
  res <- read.csv(paste('results/benchmarking_results/scmap/result/', cases[i], '_scmapcell_Pred.csv', sep = ""))
  true <- read.csv(paste('results/benchmarking_results/scmap/result/', cases[i], '_scmapcell_True.csv', sep = ""))
  scmapCell_res[[i]] <- ClassifyError(as.character(res$x), 
                                      cellTypes_test = pancreas_cellTypes[[strsplit(cases[i], "_")[[1]][2]]], 
                                      cellTypes_train = pancreas_cellTypes[[strsplit(cases[i], "_")[[1]][1]]])
  scmapCell_res[[i]] <- table(factor(scmapCell_res[[i]], levels = errClass))/length(scmapCell_res[[i]])
}


names(scmapCell_res) <- cases
scmapCell_res <- do.call(rbind, scmapCell_res)

4.8 scmap-cluster

files <- list.files("results/benchmarking_results/scmap/result", pattern = ".csv")
cases <- unique(unlist(lapply(strsplit(files, "_"), function(x) paste(x[1], x[2], sep = "_"))))

scmapCluster_res <- list()
for (i in 1:length(cases)){
  # print(cases[i])
  res <- read.csv(paste('results/benchmarking_results/scmap/result/', cases[i], '_scmapcluster_Pred.csv', sep = ""))
  true <- read.csv(paste('results/benchmarking_results/scmap/result/', cases[i], '_scmapcluster_True.csv', sep = ""))
  scmapCluster_res[[i]] <- ClassifyError(as.character(res$x), 
                                         cellTypes_test = pancreas_cellTypes[[strsplit(cases[i], "_")[[1]][2]]], 
                                         cellTypes_train = pancreas_cellTypes[[strsplit(cases[i], "_")[[1]][1]]])
  scmapCluster_res[[i]] <- table(factor(scmapCluster_res[[i]], levels = errClass))/length(scmapCluster_res[[i]])
}


names(scmapCluster_res) <- cases
scmapCluster_res <- do.call(rbind, scmapCluster_res)

4.9 Garnett

files <- list.files("results/benchmarking_results/Garnett/result", pattern = ".csv")
cases <- unique(unlist(lapply(strsplit(files, "_"), function(x) paste(x[1], x[2], sep = "_"))))

Garnett_res <- list()
for (i in 1:length(cases)) {
  # print(cases[i])
  res <- read.csv(paste('results/benchmarking_results/Garnett/result/', cases[i], '_Garnett_CV_Pred.csv', sep = ""))
  true <- read.csv(paste('results/benchmarking_results/Garnett/result/', cases[i], '_Garnett_CV_True.csv', sep = ""))
  Garnett_res[[i]] <- ClassifyError(as.character(res$x), 
                                    cellTypes_test = pancreas_cellTypes[[strsplit(cases[i], "_")[[1]][2]]], 
                                    cellTypes_train = pancreas_cellTypes[[strsplit(cases[i], "_")[[1]][1]]])
  Garnett_res[[i]] <- table(factor(Garnett_res[[i]], levels = errClass))/length(Garnett_res[[i]])
}


names(Garnett_res) <- cases
Garnett_res <- do.call(rbind, Garnett_res)

4.10 singleCellNet

files <- list.files("results/benchmarking_results/singlecellNet/", pattern = ".csv")
cases <- unique(unlist(lapply(strsplit(files, "_"), function(x) paste(x[1], x[2], sep = "_"))))

singleCellNet_res <- list()
for (i in 1:length(cases)) {
  # print(cases[i])
  res <- read.csv(paste('results/benchmarking_results/singlecellNet/', cases[i], '_singleCellNet_Pred.csv', sep = ""))
  true <- read.csv(paste('results/benchmarking_results/singlecellNet/', cases[i], '_singleCellNet_True.csv', sep = ""))
  singleCellNet_res[[i]] <- ClassifyError(as.character(res$x[1:length(true$x)]), 
                                          cellTypes_test = pancreas_cellTypes[[strsplit(cases[i], "_")[[1]][2]]], 
                                          cellTypes_train = pancreas_cellTypes[[strsplit(cases[i], "_")[[1]][1]]])
  singleCellNet_res[[i]] <- table(factor(singleCellNet_res[[i]], levels = errClass))/length(singleCellNet_res[[i]])
}


names(singleCellNet_res) <- cases
singleCellNet_res <- do.call(rbind, singleCellNet_res)

4.11 scVI

files <- list.files("results/benchmarking_results/scVI/result", pattern = ".csv")
cases <- unique(unlist(lapply(strsplit(files, "_"), function(x) paste(x[1], x[2], sep = "_"))))
cases <- cases[!grepl(".csv", cases)]
scVI_res <- list()
for (i in 1:length(cases)) {
  # print(cases[i])
  res <- read.csv(paste('results/benchmarking_results/scVI/result/', cases[i], '_scVI_Pred_map.csv', sep = ""))
  true <- read.csv(paste('results/benchmarking_results/scVI/result/', cases[i], '_scVI_True_map.csv', sep = ""))
  # scVI_res[[i]] <- sum(res$X0 == true$X0)/length(res$X0)
  scVI_res[[i]] <- ClassifyError(as.character(res$X0),
                                 cellTypes_test = true$X0,
                                 cellTypes_train = pancreas_cellTypes[[strsplit(cases[i], "_")[[1]][1]]])
  scVI_res[[i]] <- table(factor(scVI_res[[i]], levels = errClass))/length(scVI_res[[i]])
}


names(scVI_res) <- cases
scVI_res <- do.call(rbind, scVI_res)

4.12 Moana

files <- list.files("results/benchmarking_results/moana/result/", pattern = ".csv")
cases <- unique(unlist(lapply(strsplit(files, "_"), function(x) paste(x[1], x[2], sep = "_"))))

moana_res <- list()
for (i in 1:length(cases)) {
  # print(cases[i])
  res <- read.csv(paste('results/benchmarking_results/moana/result/', cases[i], '_moana_Pred.csv', sep = ""))
  true <- read.csv(paste('results/benchmarking_results/moana/result/', cases[i], '_moana_True.csv', sep = ""))
  moana_res[[i]] <- ClassifyError(as.character(res$Predicted.cell.type), 
                                  cellTypes_test = pancreas_cellTypes[[strsplit(cases[i], "_")[[1]][2]]], 
                                  cellTypes_train = pancreas_cellTypes[[strsplit(cases[i], "_")[[1]][1]]])
  moana_res[[i]] <- table(factor(moana_res[[i]], levels = errClass))/length(moana_res[[i]])
}


names(moana_res) <- cases
moana_res <- do.call(rbind, moana_res)

5 Garnett DE

files <- list.files("results/benchmarking_results/Garnett_DE/", pattern = ".csv")
cases <- unique(unlist(lapply(strsplit(files, "_"), function(x) paste(x[1], x[2], sep = "_"))))

Garnett_DE_res <- list()
for (i in 1:length(cases)) {
  # print(cases[i])
  res <- read.csv(paste('results/benchmarking_results/Garnett_DE/', cases[i], '_Garnett_DE_Pred.csv', sep = ""))
  true <- read.csv(paste('results/benchmarking_results/Garnett_DE/', cases[i], '_Garnett_DE_True.csv', sep = ""))
  Garnett_DE_res[[i]] <- ClassifyError(as.character(res$x), 
                                       cellTypes_test = pancreas_cellTypes[[strsplit(cases[i], "_")[[1]][2]]], 
                                       cellTypes_train = pancreas_cellTypes[[strsplit(cases[i], "_")[[1]][1]]])
  Garnett_DE_res[[i]] <- table(factor(Garnett_DE_res[[i]], levels = errClass))/length(Garnett_DE_res[[i]])
}


names(Garnett_DE_res) <- cases
Garnett_DE_res <- do.call(rbind, Garnett_DE_res)

5.1 SVM

files <- list.files("results/benchmarking_results/SVM_reject/", pattern = ".csv")
cases <- unique(unlist(lapply(strsplit(files, "_"), function(x) paste(x[1], x[2], sep = "_"))))

SVM_reject_res <- list()
for (i in 1:length(cases)) {
  # print(cases[i])
  res <- read.csv(paste('results/benchmarking_results/SVM_reject/', cases[i], '_SVMreject_pred.csv', sep = ""))
  true <- read.csv(paste('results/benchmarking_results/SVM_reject/', cases[i], '_SVMreject_true.csv', sep = ""))
  SVM_reject_res[[i]] <- ClassifyError(as.character(res$X0), 
                                       cellTypes_test = pancreas_cellTypes[[strsplit(cases[i], "_")[[1]][2]]], 
                                       cellTypes_train = pancreas_cellTypes[[strsplit(cases[i], "_")[[1]][1]]])
  SVM_reject_res[[i]] <- table(factor(SVM_reject_res[[i]], levels = errClass))/length(SVM_reject_res[[i]])
}


names(SVM_reject_res) <- cases
SVM_reject_res <- do.call(rbind, SVM_reject_res)

5.2 scClassify

scClassify_res <- scClassify_res_pancreas_pearson
scClassifyEnsemble_res <- ensemble_res_final_weights

6 Evaluation

sce_list <- list(baron = baron,
                 muraro = muraro2,
                 segerstolpe = segerstolpe,
                 lawlor = lawlor,
                 xin = xin,
                 wang = wang)
cellTypes_num <- unlist(lapply(sce_list, function(x) length(unique(x$cellTypes))))
combination <- combn(names(sce_list), 2)
combination <- cbind(combination, combination[c(2, 1), ])
datasetCharacterise <- cbind(cellTypes_num[combination[1, ]],
                             cellTypes_num[combination[2, ]])
rownames(datasetCharacterise) <- paste(combination[1, ], combination[2, ], sep = "_")
testLargeTrain <- rownames(datasetCharacterise)[datasetCharacterise[,1] - datasetCharacterise[,2] < 0]
saveRDS(testLargeTrain, file = "results/pancreas_testLargeTrain.rds")
all_res <- list(scClassify = scClassify_res,
                scClassify_ensemble = scClassifyEnsemble_res,
                singleR = singleR_res,
                moana = moana_res,
                singlecellNet = singleCellNet_res,
                ACTINN = ACTINN_res,
                scVI = scVI_res,
                CHETAH = CHETAH_res,
                scID = scID_res,
                Garnett = Garnett_res,
                scmapCell = scmapCell_res,
                scmapCluster = scmapCluster_res,
                scPred = scPred_res,
                Garnett_DE = Garnett_DE_res,
                SVM_reject = SVM_reject_res,
                CaSTLe = Castle_res
)

rownames(all_res$scClassify) <- gsub("muraro", "muraro2", rownames(all_res$scClassify))
rownames(all_res$singleR) <- gsub(" ", "", rownames(all_res$singleR))

all_res <- lapply(all_res, function(x){
  x <- data.frame(x)
  x$Experiment <- rownames(x)
  x$Accuracy <- x[,"correct"] + x[,"correctly.unassigned"]
  x$Accuracy_withIntermediate <- x[,"correct"] + x[,"correctly.unassigned"] + x[,"intermediate"]
  x
})

saveRDS(all_res, "results/pancreas_evaluaRes.rds")


df_toPlot <- melt(all_res)
colnames(df_toPlot) <- c("Experiment", "errClass", "value", "Method")
df_toPlot$Method <- factor(as.character(df_toPlot$Method), levels = names(all_res))

df_toPlot$Hard <- as.character(df_toPlot$Experiment) %in% testLargeTrain
df_toPlot$Hard <- ifelse(df_toPlot$Hard, "Hard", "Easy")
method_col <- c(tableau_color_pal("Tableau 20")(20)[c(11)], tableau_color_pal("Classic 20")(7)[7], tableau_color_pal("Tableau 20")(20)[c(1:10, 13:20)])
ggplot(df_toPlot[df_toPlot$errClass == "Accuracy_withIntermediate",], aes(x = Method, y = value, color = Method)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(position = position_jitter(seed = 1), alpha = 0.6, size = 1.5) +
  scale_color_manual(values = method_col) +
  theme_bw() +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
  theme(plot.title = element_text(hjust = 0.5),
        text = element_text(size=12),
        panel.border = element_rect(colour = "black", fill=NA, size=1.2)) +
  facet_wrap(~Hard) +
  ylab("Accuracy Rate (Including intermediate)") +
  theme(legend.position = "bottom", text = element_text(size = 14, face = "bold"), axis.text.x = element_blank())

ggsave("figures/Figure2A_pancreasSix_evaluation_easyHard.pdf", width = 9, height = 6)

7 Hyperparameter investigation

7.1 KNN’s K

load("results/pancreas_k_evaluation.RData")
df_toPlot <- rbind(melt(lapply(baron_scClassify_res_k_list, function(x) {
  x <- data.frame(x)
  x$accuracy_rate <- x[,1] + x[,2] + x[, 3]
  x$case <- paste("baron", rownames(x), sep = "_")
  melt(x)
})),
melt(lapply(segerstolpe_scClassify_res_k_list, function(x) {
  x <- data.frame(x)
  x$accuracy_rate <- x[,1] + x[,2] + x[, 3]
  x$case <- paste("segerstolpe", rownames(x), sep = "_")
  melt(x)
})),
melt(lapply(wang_scClassify_res_k_list, function(x) {
  x <- data.frame(x)
  x$accuracy_rate <- x[,1] + x[,2] + x[, 3]
  x$case <- paste("wang", rownames(x), sep = "_")
  melt(x)
})),
melt(lapply(lawlor_scClassify_res_k_list, function(x) {
  x <- data.frame(x)
  x$accuracy_rate <- x[,1] + x[,2] + x[, 3]
  x$case <- paste("lawlor", rownames(x), sep = "_")
  melt(x)
})),
melt(lapply(xin_scClassify_res_k_list, function(x) {
  x <- data.frame(x)
  x$accuracy_rate <- x[,1] + x[,2] + x[, 3]
  x$case <- paste("xin", rownames(x), sep = "_")
  melt(x)
})),
melt(lapply(muraro_scClassify_res_k_list, function(x) {
  x <- data.frame(x)
  x$accuracy_rate <- x[,1] + x[,2] + x[, 3]
  x$case <- paste("muraro", rownames(x), sep = "_")
  melt(x)
})))
df_toPlot$hard <- ifelse(df_toPlot$case %in% testLargeTrain, "Hard", "Easy")
ggplot(df_toPlot[df_toPlot$variable == "accuracy_rate", ], aes(x = L1, y = value, color = L1)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(position = position_jitter(seed = 1), alpha = 0.6, size = 1.5) +
  facet_grid(~hard) +
  ylab("Classification Accuracy") +
  scale_color_viridis_d(alpha = 0.5, end = 0.9, direction = -1) +
  xlab("") +
  ylim(c(0, 1)) +
  theme_bw()  +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
  theme(plot.title = element_text(hjust = 0.5),
        text = element_text(size=12),
        panel.border = element_rect(colour = "black", fill=NA, size=1.2))+
  theme(legend.position = "none", text = element_text(size = 14, face = "bold"))

ggsave("figures/FigureEV2C_pancreasSix_k_evaluation_accuracy_easyHard.pdf", width = 10, height = 6)

7.2 Correlation Threshold

load("results/pancreas_cor_evaluation.RData")
df_toPlot_cor <- rbind(melt(lapply(baron_scClassify_res_cor_list, function(x) {
  x <- data.frame(x)
  x$accuracy_rate <- x[,1] + x[,2] + x[, 3]
  x$case <- paste("baron", rownames(x), sep = "_")
  melt(x)
})),
melt(lapply(segerstolpe_scClassify_res_cor_list, function(x) {
  x <- data.frame(x)
  x$accuracy_rate <- x[,1] + x[,2] + x[, 3]
  x$case <- paste("segerstolpe", rownames(x), sep = "_")
  melt(x)
})),
melt(lapply(wang_scClassify_res_cor_list, function(x) {
  x <- data.frame(x)
  x$accuracy_rate <- x[,1] + x[,2] + x[, 3]
  x$case <- paste("wang", rownames(x), sep = "_")
  melt(x)
})),
melt(lapply(lawlor_scClassify_res_cor_list, function(x) {
  x <- data.frame(x)
  x$accuracy_rate <- x[,1] + x[,2] + x[, 3]
  x$case <- paste("lawlor", rownames(x), sep = "_")
  melt(x)
})),
melt(lapply(xin_scClassify_res_cor_list, function(x) {
  x <- data.frame(x)
  x$accuracy_rate <- x[,1] + x[,2] + x[, 3]
  x$case <- paste("xin", rownames(x), sep = "_")
  melt(x)
})),
melt(lapply(muraro_scClassify_res_cor_list, function(x) {
  x <- data.frame(x)
  x$accuracy_rate <- x[,1] + x[,2] + x[, 3]
  x$case <- paste("muraro", rownames(x), sep = "_")
  melt(x)
})))
df_toPlot_cor$hard <- ifelse(df_toPlot_cor$case %in% testLargeTrain, "Hard", "Easy")
df_toPlot_original <- df_toPlot[df_toPlot$L1 == "k=10", ]
df_toPlot_original$L1 <- "dynamic"
df_toPlot_cor <- rbind(df_toPlot_original, df_toPlot_cor)
df_toPlot_cor$L1 <- factor(df_toPlot_cor$L1, levels = unique(df_toPlot_cor$L1))

ggplot(df_toPlot_cor[df_toPlot_cor$variable == "accuracy_rate", ], aes(x = L1, y = value, color = L1)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(position = position_jitter(seed = 1), alpha = 0.6, size = 1.5) +
  facet_grid(~hard) +
  ylab("Classification Accuracy") +
  scale_color_viridis_d(alpha = 0.5, end = 0.85, direction = -1) +
  xlab("") +
  ylim(c(0, 1)) +
  theme_bw()  +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
  theme(plot.title = element_text(hjust = 0.5),
        text = element_text(size=12),
        panel.border = element_rect(colour = "black", fill=NA, size=1.2))+
  theme(legend.position = "none", text = element_text(size = 14, face = "bold"))

ggsave("figures/FigureEV2D_pancreasSix_cor_evaluation_accuracy_easyHard.pdf", width = 12, height = 6)
load("results/pancreas_hopach_kmax_evaluation.RData")
df_toPlot_hopach_kmax <- rbind(melt(lapply(baron_scClassify_res_hopach_kmax_list, function(x) {
  x <- data.frame(x)
  x$accuracy_rate <- x[,1] + x[,2] + x[, 3]
  x$case <- paste("baron", rownames(x), sep = "_")
  melt(x)
})),
melt(lapply(segerstolpe_scClassify_res_hopach_kmax_list, function(x) {
  x <- data.frame(x)
  x$accuracy_rate <- x[,1] + x[,2] + x[, 3]
  x$case <- paste("segerstolpe", rownames(x), sep = "_")
  melt(x)
})),
melt(lapply(wang_scClassify_res_hopach_kmax_list, function(x) {
  x <- data.frame(x)
  x$accuracy_rate <- x[,1] + x[,2] + x[, 3]
  x$case <- paste("wang", rownames(x), sep = "_")
  melt(x)
})),
melt(lapply(lawlor_scClassify_res_hopach_kmax_list, function(x) {
  x <- data.frame(x)
  x$accuracy_rate <- x[,1] + x[,2] + x[, 3]
  x$case <- paste("lawlor", rownames(x), sep = "_")
  melt(x)
})),
melt(lapply(xin_scClassify_res_hopach_kmax_list, function(x) {
  x <- data.frame(x)
  x$accuracy_rate <- x[,1] + x[,2] + x[, 3]
  x$case <- paste("xin", rownames(x), sep = "_")
  melt(x)
})),
melt(lapply(muraro_scClassify_res_hopach_kmax_list, function(x) {
  x <- data.frame(x)
  x$accuracy_rate <- x[,1] + x[,2] + x[, 3]
  x$case <- paste("muraro", rownames(x), sep = "_")
  melt(x)
})))
testLargeTrain <- rownames(datasetCharacterise)[datasetCharacterise[,1] - datasetCharacterise[,2] < 0]

df_toPlot_hopach_kmax$hard <- ifelse(df_toPlot_hopach_kmax$case %in% testLargeTrain, "Hard", "Easy")
ggplot(df_toPlot_hopach_kmax[df_toPlot_hopach_kmax$variable == "accuracy_rate", ], aes(x = L1, y = value, color = L1)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(position = position_jitter(seed = 1), alpha = 0.6, size = 1.5) +
  facet_grid(~hard) +
  ylab("Classification Accuracy") +
  scale_color_viridis_d(alpha = 0.5, end = 0.85, direction = -1) +
  xlab("") +
  ylim(c(0, 1)) +
  theme_bw()  +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
  theme(plot.title = element_text(hjust = 0.5),
        text = element_text(size=12),
        panel.border = element_rect(colour = "black", fill=NA, size=1.2)) +
  theme(legend.position = "none", text = element_text(size = 14, face = "bold"))

ggsave("figures/AppendixFigS3_pancreasSix_hopach_kmax_accuracy_easyHard.pdf", width = 12, height = 6)
load("results/pancreas_hopach_subset_evaluation.RData")
df_toPlot <- rbind(melt(lapply(baron_scClassify_res_hopach_subset_list, function(x) {
  x <- data.frame(x)
  x$accuracy_rate <- x[,1] + x[,2] + x[, 3]
  x$case <- paste("baron", rownames(x), sep = "_")
  melt(x)
})),
melt(lapply(segerstolpe_scClassify_res_hopach_subset_list, function(x) {
  x <- data.frame(x)
  x$accuracy_rate <- x[,1] + x[,2] + x[, 3]
  x$case <- paste("segerstolpe", rownames(x), sep = "_")
  melt(x)
})),
melt(lapply(wang_scClassify_res_hopach_subset_list, function(x) {
  x <- data.frame(x)
  x$accuracy_rate <- x[,1] + x[,2] + x[, 3]
  x$case <- paste("wang", rownames(x), sep = "_")
  melt(x)
})),
melt(lapply(lawlor_scClassify_res_hopach_subset_list, function(x) {
  x <- data.frame(x)
  x$accuracy_rate <- x[,1] + x[,2] + x[, 3]
  x$case <- paste("lawlor", rownames(x), sep = "_")
  melt(x)
})),
melt(lapply(xin_scClassify_res_hopach_subset_list, function(x) {
  x <- data.frame(x)
  x$accuracy_rate <- x[,1] + x[,2] + x[, 3]
  x$case <- paste("xin", rownames(x), sep = "_")
  melt(x)
})),
melt(lapply(muraro_scClassify_res_hopach_subset_list, function(x) {
  x <- data.frame(x)
  x$accuracy_rate <- x[,1] + x[,2] + x[, 3]
  x$case <- paste("muraro", rownames(x), sep = "_")
  melt(x)
})))
df_toPlot$hard <- ifelse(df_toPlot$case %in% testLargeTrain, "Hard", "Easy")
scClassify_res <- data.frame(scClassify_res_pancreas_pearson)
scClassify_res$Experiment <- rownames(scClassify_res)
scClassify_res$Accuracy <- scClassify_res[, 1] + scClassify_res[, 2] + scClassify_res[, 3] 

ggplot(df_toPlot[df_toPlot$variable == "accuracy_rate", ], aes(x = case, y = value, color = case)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(position = position_jitter(seed = 1), alpha = 0.6, size = 1.5) +
  geom_point(data = scClassify_res, aes(x = Experiment, y = Accuracy), col = "red") +
  #facet_grid(~hard) +
  ylab("Classification Accuracy") +
  scale_color_viridis_d(alpha = 0.5, end = 0.9, direction = -1) +
  xlab("") +
  ylim(c(0, 1)) +
  theme_bw()  +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
  theme(plot.title = element_text(hjust = 0.5),
        text = element_text(size=12),
        panel.border = element_rect(colour = "black", fill=NA, size=1.2))+
  theme(legend.position = "none", text = element_text(size = 14, face = "bold"),
        axis.text.x = element_text(angle = 90, hjust = 1),
        aspect.ratio = 1) 

ggsave("figures/FigureEV2B_pancreasSix_subsample_evaluation_accuracy_overall.pdf", width = 6, height = 6)

8 Wang-muraro tSNE plots

set.seed(2019)
muraro2 <- runTSNE(muraro2)
tsne_muraro2 <- reducedDim(muraro2, "TSNE")
df_toPlot_muraro <- data.frame(tSNE1 = tsne_muraro2[,1], tSNE2 = tsne_muraro2[,2], 
                               cellTypes = muraro2$cellTypes,
                               scClassify = as.character(wang_ensemble_res_weights$muraro$cellTypes),
                               ensemble_scores = wang_ensemble_res_weights$muraro$scores)

df_toPlot_muraro$scClassify <- as.character(df_toPlot_muraro$scClassify)

df_toPlot_muraro$scClassify[unlist(lapply(strsplit(df_toPlot_muraro$scClassify, "_"), length))>1] <- "intermediate"

df_toPlot_muraro$scClassify <- factor(df_toPlot_muraro$scClassify,
                                      levels = c(unique(df_toPlot_muraro$scClassify[!df_toPlot_muraro$scClassify %in% c("intermediate", "unassigned")]),
                                                 "intermediate", "unassigned"))

cellTypes_muraro_colour <- c(tableau_color_pal("Tableau 10")(9), tableau_color_pal("Tableau 20")(14)[c(14, 13)])
names(cellTypes_muraro_colour) <- c(names(table(muraro2$cellTypes)), "intermediate", "unassigned")
theme_yx <- function() {
  theme_bw() +
    theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
    theme(plot.title = element_text(hjust = 0.5),
          text = element_text(size=12),
          panel.border = element_rect(colour = "black", fill=NA, size=1.2)) 
  
}

g_original <- ggplot(df_toPlot_muraro, aes(x = tSNE1, y = tSNE2, color = cellTypes)) +
  geom_point() +
  scale_color_manual(values = cellTypes_muraro_colour[levels(df_toPlot_muraro$cellTypes)]) +
  theme_yx() +
  theme(aspect.ratio = 1, legend.position = "bottom") +
  labs(title = "Original Label") 

g_original

g_scClassify <- ggplot(df_toPlot_muraro, aes(x = tSNE1, y = tSNE2, color = scClassify)) +
  geom_point() +
  scale_color_manual(values = cellTypes_muraro_colour[levels(df_toPlot_muraro$scClassify)]) +
  theme_yx() +
  theme(aspect.ratio = 1, legend.position = "bottom") +
  labs(title = "scClassify") 

g_scClassify

actinn_res <- read.csv(paste('results/benchmarking_results/ACTINN/result/wang_muraro2_ACTINN_Pred.csv', sep = ""))
df_toPlot_muraro$actinn_res <- actinn_res$X0

table(df_toPlot_muraro$actinn_res, df_toPlot_muraro$cellTypes)
##           
##            acinar alpha beta delta ductal endothelial epsilon gamma stellate
##   alpha         1   798    6    44      2           0       3    32        0
##   beta          1     7  426    64      4           0       0     0        0
##   ductal      217     7   10     2    222           0       0     2        1
##   gamma         0     0    5    83      0           0       0    67        0
##   stellate      0     0    1     0     17          21       0     0       79
g_actinn <- ggplot(df_toPlot_muraro, aes(x = tSNE1, y = tSNE2, color = actinn_res)) +
  geom_point() +
  scale_color_manual(values = cellTypes_muraro_colour[levels(df_toPlot_muraro$actinn_res)]) +
  theme_yx() +
  theme(aspect.ratio = 1, legend.position = "bottom") +
  labs(title = "ACTINN") 


singler_res <- read.csv('results/benchmarking_results/SingleR/result/wang _ muraro2 _SingleR_Pred.csv')
df_toPlot_muraro$singler_res <- singler_res$x

table(df_toPlot_muraro$singler_res, df_toPlot_muraro$cellTypes)
##           
##            acinar alpha beta delta ductal endothelial epsilon gamma stellate
##   acinar      144     0    1     0      3           0       0     2        0
##   alpha         0   803    2    20      2           0       1    10        0
##   beta          0     5  428   118      4           0       0     0        0
##   delta         0     0    0    49      0           0       0     0        0
##   ductal       74     4    7     0    221           0       0     0        1
##   gamma         0     0    7     6      0           0       2    89        0
##   stellate      1     0    3     0     15          21       0     0       79
g_singleR <- ggplot(df_toPlot_muraro, aes(x = tSNE1, y = tSNE2, color = singler_res)) +
  geom_point() +
  scale_color_manual(values = cellTypes_muraro_colour[levels(df_toPlot_muraro$singler_res)]) +
  theme_yx() +
  theme(aspect.ratio = 1, legend.position = "bottom") +
  labs(title = "SingleR") 





moana_res <- read.csv('results/benchmarking_results/moana/result/wang_muraro2_moana_Pred.csv')
df_toPlot_muraro$moana_res <- moana_res$Predicted.cell.type

table(df_toPlot_muraro$moana_res, df_toPlot_muraro$cellTypes)
##           
##            acinar alpha beta delta ductal endothelial epsilon gamma stellate
##   alpha         1   805    3     1      1           0       0     0        0
##   beta          0     4  425    35      4           0       0     0        0
##   delta         0     0    0    19      0           0       0     0        0
##   ductal      217     1    9     0    235           0       0     2        1
##   gamma         0     2    9   138      0           0       3    99        0
##   stellate      1     0    2     0      5          21       0     0       79
g_moana <- ggplot(df_toPlot_muraro, aes(x = tSNE1, y = tSNE2, color = moana_res)) +
  geom_point() +
  scale_color_manual(values = cellTypes_muraro_colour[levels(df_toPlot_muraro$moana_res)]) +
  theme_yx() +
  theme(aspect.ratio = 1, legend.position = "bottom") +
  labs(title = "moana") 


singlecellNet_res <- read.csv('results/benchmarking_results/singlecellNet/wang_muraro2_singleCellNet_Pred.csv')
# singlecellNet_true <- read.csv('/Volumes/HD/iMACdropbox/Dropbox (Sydney Uni)/scclassify/singlecellNet/wang_muraro_singleCellNet_True.csv')
df_toPlot_muraro$singlecellNet_res <- singlecellNet_res$x[1:nrow(df_toPlot_muraro)]
df_toPlot_muraro$singlecellNet_res <- as.character(df_toPlot_muraro$singlecellNet_res )
df_toPlot_muraro$singlecellNet_res[df_toPlot_muraro$singlecellNet_res  == "rand"] <- "unassigned"
table(df_toPlot_muraro$singlecellNet_res, df_toPlot_muraro$cellTypes)
##             
##              acinar alpha beta delta ductal endothelial epsilon gamma stellate
##   alpha           1   804    1    18      0           0       1     7        1
##   beta            0     8  429    62      4           0       1     6        0
##   delta           0     0    0    82      0           0       0     0        0
##   ductal         68     0    3     0    215           0       0     0        1
##   gamma           0     0    5     2      0           0       1    83        0
##   stellate        0     0    0     0      1           9       0     0       72
##   unassigned    150     0   10    29     25          12       0     5        6
df_toPlot_muraro$singlecellNet_res <- as.factor(df_toPlot_muraro$singlecellNet_res )
g_singlecellNet <- ggplot(df_toPlot_muraro, aes(x = tSNE1, y = tSNE2, color = singlecellNet_res)) +
  geom_point() +
  scale_color_manual(values = cellTypes_muraro_colour[levels(df_toPlot_muraro$singlecellNet_res)]) +
  theme_yx() +
  theme(aspect.ratio = 1, legend.position = "bottom") +
  labs(title = "singlecellNet") 





CHETAH_res <- read.csv('results/benchmarking_results/CHETAH/pancreas/wang_muraro2_CHETAH_Pred.csv')
# CHETAH_true <- read.csv('/Volumes/HD/iMACdropbox/Dropbox (Sydney Uni)/scclassify/CHETAH/wang_muraro_CHETAH_True.csv')
df_toPlot_muraro$CHETAH_res <- CHETAH_res$x
df_toPlot_muraro$CHETAH_res <- as.character(df_toPlot_muraro$CHETAH_res )
table(df_toPlot_muraro$CHETAH_res, df_toPlot_muraro$cellTypes)
##                                                
##                                                 acinar alpha beta delta ductal
##   acinar                                           217     0    3     0     42
##   alpha                                              0    35    0     0      0
##   alpha_ductal_beta                                  0    28  192    17      4
##   alpha_ductal_beta_gamma_stellate                   0     0    0     0      0
##   alpha_ductal_beta_stellate                         0     0    0     0      5
##   alpha_ductal_delta_beta_gamma_stellate             0   213    2     3      1
##   alpha_ductal_delta_beta_gamma_stellate_acinar      0     0  141    10      0
##   delta                                              0     0   94   160      0
##   ductal                                             0     3    3     0     61
##   gamma                                              0   529    8     2      0
##   stellate                                           0     0    3     0      7
##   Unassigned                                         2     4    2     1    125
##                                                
##                                                 endothelial epsilon gamma
##   acinar                                                  1       0     2
##   alpha                                                   0       0     0
##   alpha_ductal_beta                                       0       0     0
##   alpha_ductal_beta_gamma_stellate                        2       0     0
##   alpha_ductal_beta_stellate                              1       0     0
##   alpha_ductal_delta_beta_gamma_stellate                  0       0     0
##   alpha_ductal_delta_beta_gamma_stellate_acinar           0       0     4
##   delta                                                   0       0     0
##   ductal                                                  0       0     0
##   gamma                                                   0       3    94
##   stellate                                               14       0     0
##   Unassigned                                              3       0     1
##                                                
##                                                 stellate
##   acinar                                               0
##   alpha                                                0
##   alpha_ductal_beta                                    0
##   alpha_ductal_beta_gamma_stellate                     0
##   alpha_ductal_beta_stellate                           0
##   alpha_ductal_delta_beta_gamma_stellate               0
##   alpha_ductal_delta_beta_gamma_stellate_acinar        0
##   delta                                                0
##   ductal                                               0
##   gamma                                                0
##   stellate                                            79
##   Unassigned                                           1
df_toPlot_muraro$CHETAH_res[unlist(lapply(strsplit(as.character(df_toPlot_muraro$CHETAH_res), "_"), length))>1] <- "intermediate"
df_toPlot_muraro$CHETAH_res[df_toPlot_muraro$CHETAH_res  == "Unassigned"] <- "unassigned"
df_toPlot_muraro$CHETAH_res <- factor(df_toPlot_muraro$CHETAH_res,
                                      levels = c(unique(df_toPlot_muraro$CHETAH_res[!df_toPlot_muraro$CHETAH_res%in%c("intermediate", "unassigned")]),
                                                 "intermediate", "unassigned"))


g_CHETAH <- ggplot(df_toPlot_muraro, aes(x = tSNE1, y = tSNE2, color = CHETAH_res)) +
  geom_point() +
  scale_color_manual(values = cellTypes_muraro_colour[levels(df_toPlot_muraro$CHETAH_res)]) +
  theme_yx() +
  theme(aspect.ratio = 1, legend.position = "bottom") +
  labs(title = "CHETAH") 







scID_res <- read.csv('results/benchmarking_results/scID/wang_muraro2_scID_Pred.csv')
# scID_true <- read.csv('/Volumes/HD/iMACdropbox/Dropbox (Sydney Uni)/scclassify/scID/wang_muraro_scID_True.csv')
df_toPlot_muraro$scID_res <- scID_res$x
df_toPlot_muraro$scID_res <- as.character(df_toPlot_muraro$scID_res )
table(df_toPlot_muraro$scID_res, df_toPlot_muraro$cellTypes)
##             
##              acinar alpha beta delta ductal endothelial epsilon gamma stellate
##   acinar        150     0    0     0      0           0       0     2        0
##   alpha           0    45    0     0      0           0       0     0        0
##   beta            0    15  398     4      0           0       0     0        0
##   delta           0     0    0   183      0           0       0     0        0
##   ductal         66     7    8     1    221           0       0     0        1
##   gamma           0     2    6     0      0           0       0    98        0
##   stellate        1     0    2     0      5          20       0     0       79
##   unassigned      2   743   34     5     19           1       3     1        0
df_toPlot_muraro$scID_res[unlist(lapply(strsplit(as.character(df_toPlot_muraro$scID_res), "_"), length))>1] <- "intermediate"
df_toPlot_muraro$scID_res[df_toPlot_muraro$scID_res  == "Unassigned"] <- "unassigned"
df_toPlot_muraro$scID_res <- factor(df_toPlot_muraro$scID_res,
                                    levels = c(unique(df_toPlot_muraro$scID_res[!df_toPlot_muraro$scID_res%in%c("intermediate", "unassigned")]),
                                               "intermediate", "unassigned"))


g_scID <- ggplot(df_toPlot_muraro, aes(x = tSNE1, y = tSNE2, color = scID_res)) +
  geom_point() +
  scale_color_manual(values = cellTypes_muraro_colour[levels(df_toPlot_muraro$scID_res)]) +
  theme_yx() +
  theme(aspect.ratio = 1, legend.position = "bottom") +
  labs(title = "scID") 




Garnett_res <- read.csv('results/benchmarking_results/Garnett/result/wang_muraro2_Garnett_CV_Pred.csv')
# Garnett_true <- read.csv('/Volumes/HD/iMACdropbox/Dropbox (Sydney Uni)/scclassify/Garnett/wang_muraro_Garnett_True.csv')
df_toPlot_muraro$Garnett_res <- Garnett_res$x
df_toPlot_muraro$Garnett_res <- as.character(df_toPlot_muraro$Garnett_res )
table(df_toPlot_muraro$Garnett_res, df_toPlot_muraro$cellTypes)
##          
##           acinar alpha beta delta ductal endothelial epsilon gamma stellate
##   alpha        0     2    0     0      0           0       0     1        0
##   Unknown    219   810  448   193    245          21       3   100       80
df_toPlot_muraro$Garnett_res[df_toPlot_muraro$Garnett_res  == "Unknown"] <- "unassigned"

df_toPlot_muraro$Garnett_res <- as.factor(df_toPlot_muraro$Garnett_res )

g_Garnett <- ggplot(df_toPlot_muraro, aes(x = tSNE1, y = tSNE2, color = Garnett_res)) +
  geom_point() +
  scale_color_manual(values = cellTypes_muraro_colour[levels(df_toPlot_muraro$Garnett_res)]) +
  theme_yx() +
  theme(aspect.ratio = 1, legend.position = "bottom") +
  labs(title = "Garnett") 




scmapCell_res <- read.csv('results/benchmarking_results/scmap/result/wang_muraro2_scmapcell_Pred.csv')

df_toPlot_muraro$scmapCell_res <- scmapCell_res$x
df_toPlot_muraro$scmapCell_res <- as.factor(df_toPlot_muraro$scmapCell_res )



g_scmapCell <- ggplot(df_toPlot_muraro, aes(x = tSNE1, y = tSNE2, color = scmapCell_res)) +
  geom_point() +
  scale_color_manual(values = cellTypes_muraro_colour[levels(df_toPlot_muraro$scmapCell_res)]) +
  theme_yx() +
  theme(aspect.ratio = 1, legend.position = "bottom") +
  labs(title = "scmapCell") 




scmapCluster_res <- read.csv('results/benchmarking_results/scmap/result/wang_muraro2_scmapcluster_Pred.csv')

df_toPlot_muraro$scmapCluster_res <- scmapCluster_res$x
df_toPlot_muraro$scmapCluster_res <- as.factor(df_toPlot_muraro$scmapCluster_res )



g_scmapCluster <- ggplot(df_toPlot_muraro, aes(x = tSNE1, y = tSNE2, color = scmapCluster_res)) +
  geom_point() +
  scale_color_manual(values = cellTypes_muraro_colour[levels(df_toPlot_muraro$scmapCluster_res)]) +
  theme_yx() +
  theme(aspect.ratio = 1, legend.position = "bottom") +
  labs(title = "scmapCluster") 



scPred_res <- read.csv('results/benchmarking_results/scPred/result/wang_muraro2_scPred_Pred.csv')
df_toPlot_muraro$scPred_res <- scPred_res$x
df_toPlot_muraro$scPred_res <- as.factor(df_toPlot_muraro$scPred_res )


g_scPred <- ggplot(df_toPlot_muraro, aes(x = tSNE1, y = tSNE2, color = scPred_res)) +
  geom_point() +
  scale_color_manual(values = cellTypes_muraro_colour[levels(df_toPlot_muraro$scPred_res)]) +
  theme_yx() +
  theme(aspect.ratio = 1, legend.position = "bottom") +
  labs(title = "scPred") 



Garnett_DE_res <- read.csv('results/benchmarking_results/Garnett_DE/wang_muraro2_Garnett_DE_Pred.csv')

df_toPlot_muraro$Garnett_DE_res <- Garnett_DE_res$x

df_toPlot_muraro$Garnett_DE_res <- as.character(df_toPlot_muraro$Garnett_DE_res)
df_toPlot_muraro$Garnett_DE_res[df_toPlot_muraro$Garnett_DE_res  == "Unknown"] <- "unassigned"

df_toPlot_muraro$Garnett_DE_res <- as.factor(df_toPlot_muraro$Garnett_DE_res )


g_Garnett_DE <- ggplot(df_toPlot_muraro, aes(x = tSNE1, y = tSNE2, color = Garnett_DE_res)) +
  geom_point() +
  scale_color_manual(values = cellTypes_muraro_colour[levels(df_toPlot_muraro$Garnett_DE_res)]) +
  theme_yx() +
  theme(aspect.ratio = 1, legend.position = "bottom") +
  labs(title = "Garnett_DE") 





SVM_reject_res <- read.csv('results/benchmarking_results/SVM_reject/wang_muraro2_SVMreject_pred.csv')
df_toPlot_muraro$SVM_reject_res <- SVM_reject_res$X0
df_toPlot_muraro$SVM_reject_res <- as.character(df_toPlot_muraro$SVM_reject_res)
df_toPlot_muraro$SVM_reject_res[df_toPlot_muraro$SVM_reject_res  == "Unassign"] <- "unassigned"

df_toPlot_muraro$SVM_reject_res <- as.factor(df_toPlot_muraro$SVM_reject_res)




g_SVM_reject <- ggplot(df_toPlot_muraro, aes(x = tSNE1, y = tSNE2, color = SVM_reject_res)) +
  geom_point() +
  scale_color_manual(values = cellTypes_muraro_colour[levels(df_toPlot_muraro$SVM_reject_res)]) +
  theme_yx() +
  theme(aspect.ratio = 1, legend.position = "bottom") +
  labs(title = "SVM_reject") 






Catsle_res <- read.csv('results/benchmarking_results/CaSTLe//result/wang_muraro2_Pred_CaSTLe.csv')
df_toPlot_muraro$Catsle_res <- Catsle_res$x
df_toPlot_muraro$Catsle_res <- as.factor(df_toPlot_muraro$Catsle_res )



g_Catsle <- ggplot(df_toPlot_muraro, aes(x = tSNE1, y = tSNE2, color = Catsle_res)) +
  geom_point() +
  scale_color_manual(values = cellTypes_muraro_colour[levels(df_toPlot_muraro$Catsle_res)]) +
  theme_yx() +
  theme(aspect.ratio = 1, legend.position = "bottom") +
  labs(title = "Catsle") 





pdf("figures/AppendixFigureS1_wang_muraro_tSNE_allMethods.pdf", width = 18, height = 18)
ggarrange(g_original + theme(text = element_text(size = 14)), 
          g_scClassify  + theme(text = element_text(size = 14)), 
          g_singleR + theme(text = element_text(size = 14)), 
          g_moana + theme(text = element_text(size = 14)), 
          g_singlecellNet + theme(text = element_text(size = 14)), 
          g_actinn + theme(text = element_text(size = 14)), 
          g_CHETAH + theme(text = element_text(size = 14)), 
          g_scID + theme(text = element_text(size = 14)), 
          g_Garnett + theme(text = element_text(size = 14)),
          g_Garnett_DE + theme(text = element_text(size = 14)),
          g_scmapCell + theme(text = element_text(size = 14)), 
          g_scmapCluster + theme(text = element_text(size = 14)), 
          g_scPred + theme(text = element_text(size = 14)), 
          g_SVM_reject + theme(text = element_text(size = 14)), 
          g_Catsle + theme(text = element_text(size = 14)), 
          ncol = 3, nrow = 5, align = "hv")
dev.off()
## quartz_off_screen 
##                 2

9 Post-clustering

library(limma)
doLimma <- function(exprsMat, cellTypes, exprs_pct = 0.05){
  
  cellTypes <- droplevels(as.factor(cellTypes))
  tt <- list()
  for (i in 1:nlevels(cellTypes)) {
    tmp_celltype <- (ifelse(cellTypes == levels(cellTypes)[i], 1, 0))
    design <- stats::model.matrix(~tmp_celltype)
    
    
    meanExprs <- do.call(cbind, lapply(c(0,1), function(i){
      Matrix::rowMeans(exprsMat[, tmp_celltype == i, drop = FALSE])
    }))
    
    meanPct <- do.call(cbind, lapply(c(0,1), function(i){
      Matrix::rowSums(exprsMat[, tmp_celltype == i, drop = FALSE] > 0)/sum(tmp_celltype == i)
    }))
    
    keep <- meanPct[,2] > exprs_pct
    
    y <- methods::new("EList")
    y$E <- exprsMat[keep, ]
    fit <- limma::lmFit(y, design = design)
    fit <- limma::eBayes(fit, trend = TRUE, robust = TRUE)
    tt[[i]] <- limma::topTable(fit, n = Inf, adjust.method = "BH", coef = 2)
    
    
    
    if (!is.null(tt[[i]]$ID)) {
      tt[[i]] <- tt[[i]][!duplicated(tt[[i]]$ID),]
      rownames(tt[[i]]) <- tt[[i]]$ID
    }
    
    tt[[i]]$meanExprs.1 <- meanExprs[rownames(tt[[i]]), 1]
    tt[[i]]$meanExprs.2 <- meanExprs[rownames(tt[[i]]), 2]
    tt[[i]]$meanPct.1 <- meanPct[rownames(tt[[i]]), 1]
    tt[[i]]$meanPct.2 <- meanPct[rownames(tt[[i]]), 2]
  }
  
  
  
  return(tt)
  
  
}

9.1 Xin-Muraro

df_toPlot_muraro2 <- data.frame(tSNE1 = tsne_muraro2[,1], tSNE2 = tsne_muraro2[,2], 
                                cellTypes = muraro2$cellTypes,
                                scClassify = xin_ensemble_res_weights$muraro$cellTypes,
                                ensemble_scores = xin_ensemble_res_weights$muraro$scores)

df_toPlot_muraro2$scClassify <- as.character(df_toPlot_muraro2$scClassify)

df_toPlot_muraro2$scClassify[unlist(lapply(strsplit(df_toPlot_muraro2$scClassify, "_"), length))>1] <- "intermediate"

df_toPlot_muraro2$scClassify <- factor(df_toPlot_muraro2$scClassify,
                                       levels = c(unique(df_toPlot_muraro2$scClassify[!df_toPlot_muraro2$scClassify%in%c("intermediate", "unassigned")]),
                                                  "intermediate", "unassigned"))




g1 <- ggplot(df_toPlot_muraro2, aes(x = tSNE1, y = tSNE2, color = cellTypes)) +
  geom_point() +
  scale_color_manual(values = cellTypes_muraro_colour) +
  theme_yx() +
  theme(aspect.ratio = 1, legend.position = "bottom") +
  labs(title = "Original Label by Muraro et al.")


g2 <- ggplot(df_toPlot_muraro2, aes(x = tSNE1, y = tSNE2, color = scClassify)) +
  geom_point() +
  scale_color_manual(values = cellTypes_muraro_colour[levels(df_toPlot_muraro2$scClassify)]) +
  theme_yx() +
  theme(aspect.ratio = 1, legend.position = "bottom") +
  labs(title = "scClassify Label (predicted by Xin et al.)") 

9.1.1 Clustering using scClust

muraro2_unassigned <- muraro2[, df_toPlot_muraro2$scClassify %in% c("unassigned")]

muraro2_unassigned <- runTSNE(muraro2_unassigned)


rowData(muraro2_unassigned)$feature_symbol <- rownames(muraro2_unassigned)

fit <- trendVar(muraro2_unassigned, parametric = TRUE, use.spikes = FALSE)
decomp <- decomposeVar(muraro2_unassigned, fit)
decomp <- decomp[order(decomp$bio, decreasing = TRUE), ]

hvg <- rownames(decomp)[decomp$FDR < 0.001 & decomp$bio > 0.1]
length(hvg)
## [1] 1406
library(scdney)
mat <- logcounts(muraro2_unassigned)[hvg,]


simlr.result <- scClust(mat, nCs = 5, 
                        similarity = "pearson", 
                        method = "simlr", seed = 1, cores.ratio = 0)
## Computing the multiple Kernels.
## Performing network diffiusion.
## Iteration:  1 
## Iteration:  2 
## Iteration:  3 
## Iteration:  4 
## Iteration:  5 
## Iteration:  6 
## Iteration:  7 
## Iteration:  8 
## Iteration:  9 
## Iteration:  10 
## Iteration:  11 
## Iteration:  12 
## Iteration:  13 
## Iteration:  14 
## Iteration:  15 
## Iteration:  16 
## Iteration:  17 
## Iteration:  18 
## Iteration:  19 
## Iteration:  20 
## Iteration:  21 
## Iteration:  22 
## Iteration:  23 
## Iteration:  24 
## Iteration:  25 
## Performing t-SNE.
## Epoch: Iteration # 100  error is:  0.09539453 
## Epoch: Iteration # 200  error is:  0.07699326 
## Epoch: Iteration # 300  error is:  0.07257045 
## Epoch: Iteration # 400  error is:  0.07026282 
## Epoch: Iteration # 500  error is:  0.06874953 
## Epoch: Iteration # 600  error is:  0.06765628 
## Epoch: Iteration # 700  error is:  0.06680778 
## Epoch: Iteration # 800  error is:  0.06613332 
## Epoch: Iteration # 900  error is:  0.06557052 
## Epoch: Iteration # 1000  error is:  0.06510032 
## Performing Kmeans.
## Performing t-SNE.
## Epoch: Iteration # 100  error is:  11.01131 
## Epoch: Iteration # 200  error is:  0.192293 
## Epoch: Iteration # 300  error is:  0.1818997 
## Epoch: Iteration # 400  error is:  0.1786703 
## Epoch: Iteration # 500  error is:  0.1767927 
## Epoch: Iteration # 600  error is:  0.1755181 
## Epoch: Iteration # 700  error is:  0.1745785 
## Epoch: Iteration # 800  error is:  0.1738567 
## Epoch: Iteration # 900  error is:  0.1732807 
## Epoch: Iteration # 1000  error is:  0.1728036
table(simlr.result$y$cluster, muraro2_unassigned$cellTypes)
##    
##     acinar alpha beta delta ductal endothelial gamma stellate
##   1      0     0    1     0    118           0     0        0
##   2    195     0    3     0      5           0     2        0
##   3      0     0    3     0     16           1     0       62
##   4      0     5    1    11      0           0     0        0
##   5      0     0    0     0      0          20     0        0
muraro2_unassigned$simlr_k5 <- as.factor(simlr.result$y$cluster)

plotTSNE(muraro2_unassigned, colour_by = "simlr_k5")

# plotTSNE(muraro2_unassigned, colour_by = "sc3_5_clusters")


#acinar: PRSS1; ductal: SPP1, KRT19; stellate: COL1A1; endothelial: ESAM; delta: SST
deList <- doLimma(logcounts(muraro2_unassigned), muraro2_unassigned$simlr_k5)
deList_filtered <- lapply(deList, function(x) {
  # x <- x[order(x$adj.P.Val, decreasing = T),]
  rownames(x)[x$adj.P.Val < 0.001 & x$logFC > 2.5][1:30]
})
# 
# deList_filtered <- lapply(deList, function(x) rownames(x[order(x$logFC, decreasing = T),])[1:20])



ind <- order(muraro2_unassigned$simlr_k5)




annotation_col <- data.frame(scClassify_postClust = as.factor(muraro2_unassigned$simlr_k5)[ind],
                             cellTypes = as.factor(muraro2_unassigned$cellTypes)[ind])
scClassify_postClust_color <- c(tableau_color_pal("Jewel Bright")(9)[c(1, 2, 3, 5, 6)])
cellTypes_color <- tableau_color_pal("Tableau 10")(9)

names(scClassify_postClust_color) <- levels(annotation_col$scClassify_postClust)
# names(cellTypes_color) <- levels(annotation_col$cellTypes)
names(cellTypes_color) <- levels(as.factor(muraro2$cellTypes))


annotation_colour <- list(scClassify_postClust = scClassify_postClust_color,
                          cellTypes = cellTypes_color
)
rownames(annotation_col) <- colnames(mat)[ind]


set.seed(12345)

pheatmap(mat[na.omit(unlist(deList_filtered[c(2, 3, 1, 5, 4)])),ind], 
         cluster_cols = TRUE, cluster_rows = FALSE,
         clustering_method = "ward.D",
         clustering_distance_cols = "correlation",
         show_colnames = FALSE,
         annotation_col = annotation_col,
         annotation_colors = annotation_colour)

muraro_postClust <- as.character(muraro2_unassigned$simlr_k5)
names(muraro_postClust) <- colnames(muraro2_unassigned)
# muraro_postClust[muraro_postClust == "4"] <- "acinar (Cluster 4)"
# muraro_postClust[muraro_postClust == "3"] <- "stellate (Cluster 3)"
# muraro_postClust[muraro_postClust == "2"] <- "ductal (Cluster 2)"
# muraro_postClust[muraro_postClust == "5"] <- "endothelial (Cluster 5)"
# muraro_postClust[muraro_postClust == "1"] <- "delta (Cluster 1)"


muraro_postClust[muraro_postClust == "2"] <- "acinar"
muraro_postClust[muraro_postClust == "3"] <- "stellate"
muraro_postClust[muraro_postClust == "1"] <- "ductal"
muraro_postClust[muraro_postClust == "5"] <- "endothelial"
muraro_postClust[muraro_postClust == "4"] <- "delta"



scClassify_postClust_color <- cellTypes_color[unique(muraro_postClust)]

annotation_colour <- list(scClassify_postClust = scClassify_postClust_color,
                          cellTypes = cellTypes_color
)
rownames(annotation_col) <- colnames(mat)[ind]


annotation_col <- data.frame(scClassify_postClust = as.factor(muraro_postClust)[ind],
                             cellTypes = as.factor(muraro2_unassigned$cellTypes)[ind])

set.seed(12345)

# pdf("figures/FigureEV4A_xin_ToMuraro_ensemble_heatmap_byCellNames", width = 10, height = 12)
pheatmap(mat[na.omit(unlist(deList_filtered[c(2, 3, 1, 5, 4)])),ind], 
         cluster_cols = TRUE, cluster_rows = FALSE,
         clustering_method = "ward.D",
         clustering_distance_cols = "correlation",
         show_colnames = FALSE,
         annotation_col = annotation_col,
         annotation_colors = annotation_colour,
         filename = "figures/FigureEV4A_xin_ToMuraro_ensemble_heatmap_byCellNames.pdf",
         width = 10, height = 12)

# dev.off()



df_toPlot_muraro2$scClassify_postClust <- as.character(df_toPlot_muraro2$scClassify)
df_toPlot_muraro2[names(muraro_postClust),]$scClassify_postClust <- muraro_postClust


g3 <- ggplot(df_toPlot_muraro2, aes(x = tSNE1, y = tSNE2, color = scClassify_postClust)) +
  geom_point() +
  scale_color_manual(values = cellTypes_muraro_colour) +
  theme_bw() +
  theme_yx() +
  theme(aspect.ratio = 1, legend.position = "bottom") +
  labs(title = "scClassify Post-clustered Label") 


grid.arrange(g1, g2, g3, ncol = 3)

pdf("figures/Figure4A_xin_ToMuraro_ensemble_tSNE_withPost.pdf", width = 18, height = 8)
grid.arrange(g1, g2, g3, ncol = 3)
dev.off()
## pdf 
##   3
sum(df_toPlot_muraro2$cellTypes == df_toPlot_muraro2$scClassify_postClust)/length(df_toPlot_muraro2$cellTypes)
## [1] 0.8972667

9.2 Xin-Wang

set.seed(2019)
wang <- runTSNE(wang)
tsne_wang <- reducedDim(wang, "TSNE")


df_toPlot_wang <- data.frame(tSNE1 = tsne_wang[,1], tSNE2 = tsne_wang[,2], 
                             cellTypes = wang$cellTypes,
                             scClassify = xin_ensemble_res_weights$wang$cellTypes,
                             ensemble_scores = xin_ensemble_res_weights$wang$scores)

df_toPlot_wang$scClassify <- as.character(df_toPlot_wang$scClassify)

df_toPlot_wang$scClassify[unlist(lapply(strsplit(df_toPlot_wang$scClassify, "_"), length))>1] <- "intermediate"

df_toPlot_wang$scClassify <- factor(df_toPlot_wang$scClassify,
                                    levels = c(unique(df_toPlot_wang$scClassify[!df_toPlot_wang$scClassify%in%c("intermediate", "unassigned")]),
                                               "intermediate", "unassigned"))


cellTypes_wang_colour <- c(tableau_color_pal("Tableau 10")(7), tableau_color_pal("Tableau 20")(14)[c(14, 13)])
names(cellTypes_wang_colour) <- c(unique(c(names(table(xin$cellTypes)), unique(wang$cellTypes))), "intermediate", "unassigned")


g1 <- ggplot(df_toPlot_wang, aes(x = tSNE1, y = tSNE2, color = cellTypes)) +
  geom_point() +
  scale_color_manual(values = cellTypes_wang_colour) +
  theme_yx() +
  theme(aspect.ratio = 1, legend.position = "bottom") +
  labs(title = "Original Label by Wang et al.")


g2 <- ggplot(df_toPlot_wang, aes(x = tSNE1, y = tSNE2, color = scClassify)) +
  geom_point() +
  scale_color_manual(values = cellTypes_wang_colour[levels(df_toPlot_wang$scClassify)]) +
  theme_yx() +
  theme(aspect.ratio = 1, legend.position = "bottom") +
  labs(title = "scClassify Label (predicted by Xin et al.)") 

9.2.1 Clustering using scClust

wang_unassigned <- wang[, df_toPlot_wang$scClassify %in% c("unassigned")]

wang_unassigned <- runTSNE(wang_unassigned)


rowData(wang_unassigned)$feature_symbol <- rownames(wang_unassigned)

fit <- trendVar(wang_unassigned, parametric = TRUE, use.spikes = FALSE)
decomp <- decomposeVar(wang_unassigned, fit)
decomp <- decomp[order(decomp$bio, decreasing = TRUE), ]

hvg <- rownames(decomp)[decomp$FDR < 0.001 & decomp$bio > 1]
length(hvg)
## [1] 216
library(scdney)
mat <- logcounts(wang_unassigned)[hvg,]


simlr.result <- scClust(mat, nCs = 2, 
                        similarity = "pearson", 
                        method = "simlr", seed = 1, cores.ratio = 0)
## Computing the multiple Kernels.
## Performing network diffiusion.
## Iteration:  1 
## Iteration:  2 
## Iteration:  3 
## Iteration:  4 
## Iteration:  5 
## Iteration:  6 
## Iteration:  7 
## Iteration:  8 
## Iteration:  9 
## Iteration:  10 
## Iteration:  11 
## Iteration:  12 
## Performing t-SNE.
## Epoch: Iteration # 100  error is:  0.8030925 
## Epoch: Iteration # 200  error is:  3.003397 
## Epoch: Iteration # 300  error is:  0.8317716 
## Epoch: Iteration # 400  error is:  0.4048715 
## Epoch: Iteration # 500  error is:  0.2892251 
## Epoch: Iteration # 600  error is:  0.2329464 
## Epoch: Iteration # 700  error is:  1.019868 
## Epoch: Iteration # 800  error is:  0.4779615 
## Epoch: Iteration # 900  error is:  0.4244695 
## Epoch: Iteration # 1000  error is:  0.2678447 
## Performing Kmeans.
## Performing t-SNE.
## Epoch: Iteration # 100  error is:  18.45086 
## Epoch: Iteration # 200  error is:  1.455163 
## Epoch: Iteration # 300  error is:  0.9427795 
## Epoch: Iteration # 400  error is:  0.4704646 
## Epoch: Iteration # 500  error is:  0.4391199 
## Epoch: Iteration # 600  error is:  0.398169 
## Epoch: Iteration # 700  error is:  0.3288774 
## Epoch: Iteration # 800  error is:  0.3262553 
## Epoch: Iteration # 900  error is:  0.3103893 
## Epoch: Iteration # 1000  error is:  0.8946993
wang_unassigned$simlr_k2 <- as.factor(simlr.result$y$cluster)

plotTSNE(wang_unassigned, colour_by = "simlr_k2")

# plotTSNE(wang_unassigned, colour_by = "sc3_5_clusters")


table(wang_unassigned$simlr_k2, wang_unassigned$cellTypes)
##    
##     acinar ductal stellate
##   1      5     56        7
##   2      0      4       25
mclust::adjustedRandIndex(wang_unassigned$simlr_k2, wang_unassigned$cellTypes)
## [1] 0.4837222
#acinar: PRSS1; ductal: SPP1, KRT19; stellate: COL1A1; endothelial: ESAM; delta: SST
deList <- doLimma(mat, wang_unassigned$simlr_k2)
deList_filtered <- lapply(deList, function(x) {
  # x <- x[order(x$adj.P.Val, decreasing = T),]
  rownames(x)[x$adj.P.Val < 0.01 & x$logFC > 2.5][1:25]
})
# 



ind <- order(wang_unassigned$simlr_k2)




annotation_col <- data.frame(scClassify_postClust = as.factor(wang_unassigned$simlr_k2)[ind],
                             cellTypes = as.factor(wang_unassigned$cellTypes)[ind])
scClassify_postClust_color <- c(tableau_color_pal("Jewel Bright")(9)[c(1, 2)])


names(scClassify_postClust_color) <- levels(annotation_col$scClassify_postClust)



annotation_colour <- list(scClassify_postClust = scClassify_postClust_color,
                          cellTypes = cellTypes_wang_colour[levels(as.factor(wang$cellTypes))]
)
rownames(annotation_col) <- colnames(mat)[ind]


pheatmap(logcounts(wang_unassigned)[na.omit(unlist(deList_filtered[c(1, 2)])),ind], 
         cluster_cols = TRUE, cluster_rows = FALSE,
         clustering_method = "ward.D",
         clustering_distance_cols = "correlation",
         show_colnames = FALSE,
         annotation_col = annotation_col,
         annotation_colors = annotation_colour)

wang_postClust <- as.character(wang_unassigned$simlr_k2)
names(wang_postClust) <- colnames(wang_unassigned)


wang_postClust[wang_postClust == "1"] <- "ductal"
wang_postClust[wang_postClust == "2"] <- "stellate"




annotation_colour <- list(scClassify_postClust = cellTypes_wang_colour[levels(as.factor(wang$cellTypes))],
                          cellTypes = cellTypes_wang_colour[levels(as.factor(wang$cellTypes))]
)
rownames(annotation_col) <- colnames(mat)[ind]


annotation_col <- data.frame(scClassify_postClust = as.factor(wang_postClust)[ind],
                             cellTypes = as.factor(wang_unassigned$cellTypes)[ind])

set.seed(12345)

pheatmap(mat[na.omit(unlist(deList_filtered[c(2, 1)])),], 
         cluster_cols = TRUE, cluster_rows = FALSE,
         clustering_method = "ward.D",
         clustering_distance_cols = "correlation",
         show_colnames = FALSE,
         annotation_col = annotation_col,
         annotation_colors = annotation_colour,
         filename = "figures/FigureEV4C_xin_Towang_ensemble_heatmap_byCellNames.pdf",
         width = 10, height = 12)



df_toPlot_wang$scClassify_postClust <- as.character(df_toPlot_wang$scClassify)
df_toPlot_wang[names(wang_postClust),]$scClassify_postClust <- wang_postClust


g3 <- ggplot(df_toPlot_wang, aes(x = tSNE1, y = tSNE2, color = scClassify_postClust)) +
  geom_point() +
  scale_color_manual(values = cellTypes_wang_colour) +
  theme_bw() +
  theme_yx() +
  theme(aspect.ratio = 1, legend.position = "bottom") +
  labs(title = "scClassify Post-clustered Label") 


grid.arrange(g1, g2, g3, ncol = 3)

pdf("figures/FigureEV4B_xin_Towang_ensemble_tSNE_withPost.pdf", width = 18, height = 8)
grid.arrange(g1, g2, g3, ncol = 3)
dev.off()
## pdf 
##   3
sum(df_toPlot_wang$cellTypes == df_toPlot_wang$scClassify_postClust)/length(df_toPlot_wang$cellTypes)
## [1] 0.8702595

10 SessionInfo

sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Catalina 10.15.3
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_AU.UTF-8/en_AU.UTF-8/en_AU.UTF-8/C/en_AU.UTF-8/en_AU.UTF-8
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] scdney_0.1.5                limma_3.42.0               
##  [3] ggpubr_0.2.4                magrittr_1.5               
##  [5] reshape2_1.4.3              pheatmap_1.0.12            
##  [7] ggthemes_4.2.0              gridExtra_2.3              
##  [9] scran_1.13.10               scater_1.13.7              
## [11] ggplot2_3.2.1               SingleCellExperiment_1.8.0 
## [13] SummarizedExperiment_1.16.0 DelayedArray_0.12.0        
## [15] BiocParallel_1.20.0         matrixStats_0.55.0         
## [17] Biobase_2.46.0              GenomicRanges_1.38.0       
## [19] GenomeInfoDb_1.22.0         IRanges_2.20.1             
## [21] S4Vectors_0.23.25           BiocGenerics_0.32.0        
## 
## loaded via a namespace (and not attached):
##   [1] snow_0.4-3               backports_1.1.5          Hmisc_4.3-0             
##   [4] plyr_1.8.4               igraph_1.2.4.2           lazyeval_0.2.2          
##   [7] splines_3.6.1            amap_0.8-17              digest_0.6.23           
##  [10] foreach_1.4.7            htmltools_0.4.0          viridis_0.5.1           
##  [13] checkmate_1.9.4          cluster_2.1.0            doParallel_1.0.15       
##  [16] recipes_0.1.7            gower_0.2.1              colorspace_1.4-1        
##  [19] pan_1.6                  xfun_0.11                dplyr_0.8.3             
##  [22] crayon_1.3.4             RCurl_1.95-4.12          lme4_1.1-21             
##  [25] zeallot_0.1.0            survival_3.1-8           iterators_1.0.12        
##  [28] glue_1.3.1               gtable_0.3.0             ipred_0.9-9             
##  [31] zlibbioc_1.32.0          XVector_0.26.0           BiocSingular_1.2.0      
##  [34] jomo_2.6-10              abind_1.4-5              scales_1.1.0            
##  [37] mvtnorm_1.0-11           edgeR_3.28.0             Rcpp_1.0.3              
##  [40] viridisLite_0.3.0        htmlTable_1.13.2         dqrng_0.2.1             
##  [43] mclust_5.4.5             foreign_0.8-72           rsvd_1.0.2              
##  [46] Formula_1.2-3            lava_1.6.6               prodlim_2019.11.13      
##  [49] htmlwidgets_1.5.1        RColorBrewer_1.1-2       acepack_1.4.1           
##  [52] mice_3.6.0               pkgconfig_2.0.3          farver_2.0.1            
##  [55] nnet_7.3-12              locfit_1.5-9.1           caret_6.0-84            
##  [58] dynamicTreeCut_1.63-1    tidyselect_0.2.5         labeling_0.3            
##  [61] rlang_0.4.2              munsell_0.5.0            tools_3.6.1             
##  [64] generics_0.0.2           ggridges_0.5.1           broom_0.5.2             
##  [67] evaluate_0.14            stringr_1.4.0            yaml_2.2.0              
##  [70] ModelMetrics_1.2.2       knitr_1.26               randomForest_4.6-14     
##  [73] purrr_0.3.3              dendextend_1.13.2        mitml_0.3-7             
##  [76] nlme_3.1-141             compiler_3.6.1           rstudioapi_0.10         
##  [79] beeswarm_0.2.3           e1071_1.7-3              ggsignif_0.6.0          
##  [82] tibble_2.1.3             statmod_1.4.32           DescTools_0.99.31       
##  [85] stringi_1.4.3            lattice_0.20-38          Matrix_1.2-18           
##  [88] nloptr_1.2.1             vctrs_0.2.0              pillar_1.4.2            
##  [91] lifecycle_0.1.0          BiocNeighbors_1.4.1      data.table_1.12.6       
##  [94] cowplot_1.0.0            bitops_1.0-6             irlba_2.3.3             
##  [97] R6_2.4.1                 latticeExtra_0.6-28      vipor_0.4.5             
## [100] codetools_0.2-16         boot_1.3-23              MASS_7.3-51.4           
## [103] assertthat_0.2.1         MAST_1.12.0              withr_2.1.2             
## [106] GenomeInfoDbData_1.2.2   expm_0.999-4             doSNOW_1.0.18           
## [109] grid_3.6.1               rpart_4.1-15             timeDate_3043.102       
## [112] tidyr_1.0.0              class_7.3-15             minqa_1.2.4             
## [115] rmarkdown_1.18           DelayedMatrixStats_1.8.0 Rtsne_0.15              
## [118] clusteval_0.1            lubridate_1.7.4          base64enc_0.1-3         
## [121] ggbeeswarm_0.6.0